import os
import torch
import numpy as np
import json
import re
from torch.autograd import Variable


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which always truncates the longer sequence
    # one token at a time. This makes more sense than truncating an equal
    # percentage of tokens from each: if one sequence is very short, each
    # token truncated from it likely carries more information than a token
    # from the longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
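

# Illustrative trace of the truncation heuristic (hypothetical tokens):
#
#   a = ["the", "cat", "sat"]
#   b = ["on", "the", "mat", "in", "the", "hat"]
#   _truncate_seq_pair(a, b, 6)
#   # b, the longer list, is popped from the end until len(a) + len(b) <= 6:
#   # a == ["the", "cat", "sat"], b == ["on", "the", "mat"]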


def tok2int_sent(sentence, tokenizer, max_seq_length):
    """Converts one (claim, evidence) pair into padded BERT input ids,
    attention mask, and segment ids."""
    sent_a, sent_b = sentence
    tokens_a = tokenizer.tokenize(sent_a)
    tokens_b = None
    if sent_b:
        tokens_b = tokenizer.tokenize(sent_b)
        # Account for [CLS], [SEP], [SEP] with "- 3".
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2".
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    if tokens_b:
        tokens = tokens + tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Zero-pad up to max_seq_length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    return input_ids, input_mask, segment_ids
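

# Resulting layout for max_seq_length = 8 and a hypothetical pair
# ("a b", "c d"), assuming each word is a single WordPiece token:
#
#   tokens      = [CLS]  a   b  [SEP]  c   d  [SEP]  <pad>
#   segment_ids =   0    0   0    0    1   1    1      0
#   input_mask  =   1    1   1    1    1   1    1      0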


def tok2int_list(src_list, tokenizer, max_seq_length, max_seq_size=-1):
    """Converts a list of sentence pairs into lists of padded ids, masks,
    and segment ids."""
    inp_padding = list()
    msk_padding = list()
    seg_padding = list()
    for sent in src_list:
        input_ids, input_mask, input_seg = tok2int_sent(sent, tokenizer, max_seq_length)
        inp_padding.append(input_ids)
        msk_padding.append(input_mask)
        seg_padding.append(input_seg)
    # if max_seq_size != -1:
    #     inp_padding = inp_padding[:max_seq_size]
    #     msk_padding = msk_padding[:max_seq_size]
    #     seg_padding = seg_padding[:max_seq_size]
    #     inp_padding += ([[0] * max_seq_length] * (max_seq_size - len(inp_padding)))
    #     msk_padding += ([[0] * max_seq_length] * (max_seq_size - len(msk_padding)))
    #     seg_padding += ([[0] * max_seq_length] * (max_seq_size - len(seg_padding)))
    return inp_padding, msk_padding, seg_padding
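

# Minimal usage sketch (illustrative; assumes a WordPiece tokenizer exposing
# tokenize / convert_tokens_to_ids, e.g. transformers'
# BertTokenizer.from_pretrained("bert-base-uncased")):
#
#   pairs = [("The claim text .", "The evidence sentence .")]
#   inp, msk, seg = tok2int_list(pairs, tokenizer, max_seq_length=130)
#   # inp, msk, and seg each hold one row of 130 ints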


class DataLoader(object):
    ''' For data iteration '''

    def __init__(self, data_path, tokenizer, args, test=False, cuda=True, batch_size=64):
        self.cuda = cuda
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_len = args.max_len
        self.evi_num = args.evi_num
        self.threshold = args.threshold
        self.data_path = data_path
        self.test = test
        examples = self.read_file(data_path)
        self.examples = examples
        self.total_num = len(examples)
        if self.test:
            # Cap the number of evaluation examples.
            self.total_num = min(self.total_num, 100000)
        # Number of batches per epoch, counting a final partial batch.
        self.total_step = np.ceil(self.total_num * 1.0 / batch_size)
        self.shuffle()
        self.step = 0

    def process_sent(self, sentence):
        # Undo FEVER's PTB-style escapes and normalize punctuation.
        sentence = re.sub(r" \-LSB\-.*?\-RSB\-", "", sentence)
        sentence = re.sub(r"\-LRB\- \-RRB\- ", "", sentence)
        sentence = re.sub(" -LRB-", " ( ", sentence)
        sentence = re.sub("-RRB-", " )", sentence)
        sentence = re.sub("--", "-", sentence)
        sentence = re.sub("``", '"', sentence)
        sentence = re.sub("''", '"', sentence)
        return sentence
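
    # Example (hypothetical input):
    #   "Foo -LSB- 1 -RSB- said `` hi ''"
    # is cleaned to
    #   'Foo said " hi "'
    # the " -LSB-...-RSB-" span is dropped and PTB quotes become plain quotes.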

    def process_wiki_title(self, title):
        title = re.sub("_", " ", title)
        title = re.sub(" -LRB-", " ( ", title)
        title = re.sub("-RRB-", " )", title)
        title = re.sub("-COLON-", ":", title)
        return title

    def read_file(self, data_path):
        examples = list()
        with open(data_path) as fin:
            for line in fin:
                sublines = line.strip().split("\t")
                # TSV columns 0 / 2 / 4 hold the claim, a positive evidence
                # sentence, and a negative evidence sentence.
                examples.append([self.process_sent(sublines[0]),
                                 self.process_sent(sublines[2]),
                                 self.process_sent(sublines[4])])
        return examples

    def shuffle(self):
        np.random.shuffle(self.examples)

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def __len__(self):
        return int(self.total_step)

    def next(self):
        ''' Get the next batch '''
        if self.step < self.total_step:
            examples = self.examples[self.step * self.batch_size: (self.step + 1) * self.batch_size]
            pos_inputs = list()
            neg_inputs = list()
            for example in examples:
                pos_inputs.append([example[0], example[1]])
                neg_inputs.append([example[0], example[2]])
            inp_pos, msk_pos, seg_pos = tok2int_list(pos_inputs, self.tokenizer, self.max_len)
            inp_neg, msk_neg, seg_neg = tok2int_list(neg_inputs, self.tokenizer, self.max_len)
            inp_tensor_pos = Variable(torch.LongTensor(inp_pos))
            msk_tensor_pos = Variable(torch.LongTensor(msk_pos))
            seg_tensor_pos = Variable(torch.LongTensor(seg_pos))
            inp_tensor_neg = Variable(torch.LongTensor(inp_neg))
            msk_tensor_neg = Variable(torch.LongTensor(msk_neg))
            seg_tensor_neg = Variable(torch.LongTensor(seg_neg))
            if self.cuda:
                inp_tensor_pos = inp_tensor_pos.cuda()
                msk_tensor_pos = msk_tensor_pos.cuda()
                seg_tensor_pos = seg_tensor_pos.cuda()
                inp_tensor_neg = inp_tensor_neg.cuda()
                msk_tensor_neg = msk_tensor_neg.cuda()
                seg_tensor_neg = seg_tensor_neg.cuda()
            self.step += 1
            return inp_tensor_pos, msk_tensor_pos, seg_tensor_pos, inp_tensor_neg, msk_tensor_neg, seg_tensor_neg
        else:
            self.step = 0
            if not self.test:
                # examples = self.read_file(self.data_path)
                # self.examples = examples
                self.shuffle()
            raise StopIteration()
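

# Minimal usage sketch (illustrative): `args` is assumed to carry max_len,
# evi_num, and threshold, and `tokenizer` to be a BERT WordPiece tokenizer:
#
#   loader = DataLoader("train_pairs.tsv", tokenizer, args, cuda=False, batch_size=32)
#   for inp_pos, msk_pos, seg_pos, inp_neg, msk_neg, seg_neg in loader:
#       ...  # pairwise ranking step over positive/negative evidence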


class DataLoaderTest(object):
    ''' For data iteration '''

    def __init__(self, data_path, tokenizer, args, cuda=True, batch_size=64):
        self.cuda = cuda
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_len = args.max_len
        self.evi_num = args.evi_num
        self.threshold = args.threshold
        self.data_path = data_path
        inputs, ids, evi_list = self.read_all(data_path)
        self.inputs = inputs
        self.ids = ids
        self.evi_list = evi_list
        self.total_num = len(inputs)
        self.total_step = np.ceil(self.total_num * 1.0 / batch_size)
        self.step = 0

    def process_sent(self, sentence):
        sentence = re.sub(r" \-LSB\-.*?\-RSB\-", "", sentence)
        sentence = re.sub(r"\-LRB\- \-RRB\- ", "", sentence)
        sentence = re.sub(" -LRB-", " ( ", sentence)
        sentence = re.sub("-RRB-", " )", sentence)
        sentence = re.sub("--", "-", sentence)
        sentence = re.sub("``", '"', sentence)
        sentence = re.sub("''", '"', sentence)
        return sentence

    def process_wiki_title(self, title):
        title = re.sub("_", " ", title)
        title = re.sub(" -LRB-", " ( ", title)
        title = re.sub("-RRB-", " )", title)
        title = re.sub("-COLON-", ":", title)
        return title

    def read_all(self, data):
        if not isinstance(data, list):
            with open(data) as f:
                data_ = [json.loads(line) for line in f]
        else:
            data_ = data
        inputs = list()
        ids = list()
        evi_list = list()
        for instance in data_:
            claim = instance['claim']
            instance_id = instance['id']
            # Flatten to one (claim, evidence) pair per evidence entry; the
            # evidence sentence text sits at index 2 of each entry.
            for evidence in instance['evidence']:
                ids.append(instance_id)
                inputs.append([self.process_sent(claim), self.process_sent(evidence[2])])
                evi_list.append(evidence)
        return inputs, ids, evi_list

    def shuffle(self):
        # Shuffle the three parallel lists with one permutation so that
        # inputs, ids, and evidence entries stay aligned.
        perm = np.random.permutation(len(self.inputs))
        self.inputs = [self.inputs[i] for i in perm]
        self.ids = [self.ids[i] for i in perm]
        self.evi_list = [self.evi_list[i] for i in perm]

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def __len__(self):
        return int(self.total_step)

    def next(self):
        ''' Get the next batch '''
        if self.step < self.total_step:
            inputs = self.inputs[self.step * self.batch_size: (self.step + 1) * self.batch_size]
            ids = self.ids[self.step * self.batch_size: (self.step + 1) * self.batch_size]
            evi_list = self.evi_list[self.step * self.batch_size: (self.step + 1) * self.batch_size]
            inp, msk, seg = tok2int_list(inputs, self.tokenizer, self.max_len, -1)
            inp_tensor_input = Variable(torch.LongTensor(inp))
            msk_tensor_input = Variable(torch.LongTensor(msk))
            seg_tensor_input = Variable(torch.LongTensor(seg))
            if self.cuda:
                inp_tensor_input = inp_tensor_input.cuda()
                msk_tensor_input = msk_tensor_input.cuda()
                seg_tensor_input = seg_tensor_input.cuda()
            self.step += 1
            return inp_tensor_input, msk_tensor_input, seg_tensor_input, ids, evi_list
        else:
            self.step = 0
            raise StopIteration()
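

# Minimal usage sketch for inference (illustrative; same `args` and
# `tokenizer` assumptions as above, input file in JSON-lines format):
#
#   test_loader = DataLoaderTest("dev.jsonl", tokenizer, args, cuda=False)
#   for inp, msk, seg, ids, evi_list in test_loader:
#       ...  # score each (claim, evidence) pair; ids/evi_list keep alignment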