Spaces:
Build error
Build error
PeteBleackley
committed on
Commit
·
4f8366b
1
Parent(s):
e149b0f
Components for managing training corpora
Browse files- qarac/corpora/CombinedCorpus.py +199 -0
- qarac/corpora/CorpusLoader.py +91 -0
- qarac/corpora/Preprocessor.py +91 -0
- qarac/models/QaracTrainerModel.py +13 -2
- qarac/utils/CoreferenceResolver.py +36 -0
qarac/corpora/CombinedCorpus.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Created on Wed Sep 20 14:12:34 2023
|
| 5 |
+
|
| 6 |
+
@author: peter
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import collections
|
| 10 |
+
import numpy
|
| 11 |
+
import tensorflow
|
| 12 |
+
import keras
|
| 13 |
+
import CorpusLoader
|
| 14 |
+
import CorpusRepeater
|
| 15 |
+
|
| 16 |
+
class CombinedCorpus(keras.utils.Sequence):
    """Interleaves several training corpora and serves them as padded batches."""

    def __init__(self, tokenizer, batch_size=32, **kwargs):
        """
        Creates the Combined Corpus

        Parameters
        ----------
        tokenizer : tokenizers.Tokenizer
            Tokenizer used in preparing datasets
        batch_size : int, optional
            Number of samples per batch. The default is 32.
        **kwargs : str
            paths for tokenized datasets; expected keys are 'all_text',
            'question_answering', 'reasoning' and 'consistency'
            (the historical misspelling 'consitency' is still accepted)

        Returns
        -------
        None.

        """
        super(CombinedCorpus, self).__init__()
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        start_doc = tokenizer.encode('<s>')
        end_doc = tokenizer.encode('</s>')
        self.all_text = CorpusLoader.CorpusLoader(kwargs['all_text'],
                                                  start_doc,
                                                  end_doc,
                                                  ['all_text'],
                                                  {'all_text': ('offset_text',
                                                                'encode_decode')})
        # The smaller corpora are repeated so every corpus yields one sample
        # per sample of the (largest) all_text corpus.
        n_samples = len(self.all_text)
        self.n_batches = numpy.ceil(n_samples / float(batch_size)).astype(int)
        self.question_answering = CorpusRepeater.CorpusRepeater(
            CorpusLoader.CorpusLoader(kwargs['question_answering'],
                                      start_doc,
                                      end_doc,
                                      ['question', 'answer'],
                                      {}),
            n_samples)
        self.reasoning = CorpusRepeater.CorpusRepeater(
            CorpusLoader.CorpusLoader(kwargs['reasoning'],
                                      start_doc,
                                      end_doc,
                                      ['proposition0', 'proposition1'],
                                      {'conclusion': ('conclusion_offset',
                                                      'reasoning')}),
            n_samples)
        # BUG FIX: the key was read as 'consitency'; accept the correct
        # spelling first, but fall back to the misspelling so existing
        # callers keep working.
        consistency_path = kwargs.get('consistency', kwargs.get('consitency'))
        self.consistency = CorpusRepeater.CorpusRepeater(
            CorpusLoader.CorpusLoader(consistency_path,
                                      start_doc,
                                      end_doc,
                                      ['statement0', 'statement1'],
                                      {},
                                      'consistency'),
            n_samples)
        self.batches = []
        self.pad_token = self.tokenizer.token_to_id('<pad>')
        self.on_epoch_end()

    def __len__(self):
        """
        Number of batches

        Returns
        -------
        int
            Number of batches

        """
        return self.n_batches

    def __getitem__(self, n):
        """
        Retrieves a batch of data

        Parameters
        ----------
        n : int
            index of batch to retrieve

        Returns
        -------
        tuple(dict, dict)
            Batch of data

        """
        return self.batches[n]

    def samples(self):
        """
        Iterates over samples of data

        Yields
        ------
        X : dict
            Sample of training inputs
        Y : dict
            Sample of training outputs

        """
        for sample in zip(self.all_text,
                          self.question_answering,
                          self.reasoning,
                          self.consistency):
            X = {}
            Y = {}
            # Each corpus yields an (inputs, outputs) pair; merge them into
            # one combined training sample.
            for (x, y) in sample:
                X.update(x)
                Y.update(y)
            yield (X, Y)

    def on_epoch_end(self):
        """
        Regenerates batches of data

        Returns
        -------
        None.

        """
        self.batches = []
        n = 0
        X = collections.defaultdict(list)
        Y = collections.defaultdict(list)
        # BUG FIX: 'samples' and 'items' are methods and were referenced
        # without calling them, which raises TypeError when iterated.
        for (x, y) in self.samples():
            for (key, value) in x.items():
                X[key].append(value)
            for (key, value) in y.items():
                Y[key].append(value)
            n += 1
            if n == self.batch_size:
                self.batches.append(self.batch(X, Y))
                n = 0
                X.clear()
                Y.clear()
        if n != 0:
            # Final, possibly short, batch.
            self.batches.append(self.batch(X, Y, n))

    def batch(self, X, Y, n=None):
        """
        Creates a batch of data from samples

        Parameters
        ----------
        X : dict[str, list]
            Input samples
        Y : dict[str, list]
            output samples
        n : int, optional
            Size of batch. Defaults to the corpus batch size.

        Returns
        -------
        X : dict[str, tensorflow.Tensor]
            Batched input samples
        Y : dict[str, tensorflow.Tensor]
            Batched output samples

        """
        if n is None:
            n = self.batch_size
        # BUG FIX: build fresh dicts instead of mutating the caller's
        # buffers — on_epoch_end reuses and clear()s X and Y between
        # batches, which would empty out every previously stored batch.
        X_batch = {key: self.pad(value) for (key, value) in X.items()}
        Y_batch = {key: (tensorflow.constant(value) if key == 'consistency'
                         else self.pad(value))
                   for (key, value) in Y.items()}
        # Target for the question_answering objective is a zero difference
        # vector; 768 presumably matches the encoder hidden size — TODO confirm.
        Y_batch['question_answering'] = tensorflow.zeros((n, 768))
        return (X_batch, Y_batch)

    def pad(self, batch):
        """
        Pads a batch of samples to uniform length

        Parameters
        ----------
        batch : list[tokenizers.Encoding]
            Samples to be padded

        Returns
        -------
        tensorflow.Tensor
            Padded data

        """
        maxlen = max(len(sample) for sample in batch)
        # BUG FIX: Encoding.pad pads in place and returns None, so the
        # original chained '.pad(...).ids' raised AttributeError.
        padded = []
        for sample in batch:
            sample.pad(maxlen, pad_id=self.pad_token)
            padded.append(sample.ids)
        return tensorflow.constant(padded)
|
| 198 |
+
|
| 199 |
+
|
qarac/corpora/CorpusLoader.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Created on Wed Sep 20 07:48:54 2023
|
| 5 |
+
|
| 6 |
+
@author: peter
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import datasets
|
| 10 |
+
import tokenizers
|
| 11 |
+
|
| 12 |
+
class CorpusLoader(object):
    """Loads a tokenized dataset from disk and yields training samples."""

    def __init__(self, path,
                 start_doc,
                 end_doc,
                 text_inputs,
                 text_outputs,
                 label=None):
        """
        Creates the Corpus Loader

        Parameters
        ----------
        path : str
            Path to load dataset from
        start_doc : tokenizers.Encoding
            Encoding of the document start character
        end_doc : tokenizers.Encoding
            Encoding of the document end character
        text_inputs : list[str]
            Columns of the dataset to add to the inputs
        text_outputs : dict[str, tuple[str, str]]
            The columns of the dataset to add to the outputs. The key is the
            name of the column in the original dataset, the first element of
            the tuple is the name that the column prefixed with '<s>' will
            have in the inputs, and the second element of the tuple is the
            name that the column suffixed with '</s>' will have in the
            outputs
        label : str, optional
            A column of numerical labels to add to the outputs. The default
            is None.

        Returns
        -------
        None.

        """
        data = datasets.Dataset.from_file(path)
        self.n_rows = len(data)
        self.dataset = data.to_iterable_dataset()
        self.start_doc = start_doc
        self.end_doc = end_doc
        self.text_inputs = text_inputs
        self.text_outputs = text_outputs
        self.label = label

    def __len__(self):
        """
        The length of the corpus

        Returns
        -------
        int
            The number of samples

        """
        return self.n_rows

    def __iter__(self):
        """
        Generates samples in a random order

        Yields
        ------
        X : dict
            Inputs for model
        Y : dict
            outputs for model

        """
        for row in self.dataset.shuffle():
            X = {}
            Y = {}
            for column in self.text_inputs:
                X[column] = row[column]
            # BUG FIX: the mapping values are (x_name, y_name) pairs; the
            # original unpacking '(column,(x_name),y_name)' mis-parsed the
            # tuple and raised ValueError on any non-empty text_outputs.
            for column, (x_name, y_name) in self.text_outputs.items():
                X[x_name] = tokenizers.Encoding.merge([self.start_doc, row[column]])
                Y[y_name] = tokenizers.Encoding.merge([row[column], self.end_doc])
            if self.label is not None:
                Y[self.label] = row[self.label]
            yield (X, Y)
|
| 91 |
+
|
qarac/corpora/Preprocessor.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
Created on Mon Sep 18 13:18:59 2023
|
| 5 |
+
|
| 6 |
+
@author: peter
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import tokenizers
|
| 10 |
+
import datasets
|
| 11 |
+
import pandas
|
| 12 |
+
|
| 13 |
+
class Preprocessor(object):
    """Tokenizes text columns and converts labels for training corpora."""

    def __init__(self, tokenizer_path='roberta-base'):
        """
        Creates the preprocessor

        Parameters
        ----------
        tokenizer_path : str, optional
            The path to the pretrained tokenizer. The default is
            'roberta-base'.

        Returns
        -------
        None.

        """
        self.tokenizer = tokenizers.Tokenizer.from_pretrained(tokenizer_path)
        self.start_token = self.tokenizer.encode('<s>')
        self.end_token = self.tokenizer.encode('</s>')

    def __call__(self, data):
        """
        Tokenizes a column of data

        Parameters
        ----------
        data : pandas.Series
            Column of text data

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data

        """
        return self.tokenizer.encode_batch(data, add_special_tokens=False)

    def combine(self, *args):
        """
        Tokenises several data columns as a single batch

        Parameters
        ----------
        *args : sequence of pandas.Series
            Text columns to be concatenated and tokenized together

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data from all the columns, in order

        """
        # BUG FIX: pandas has no 'concatenate' function; the correct API
        # is pandas.concat.
        return self(pandas.concat(args))

    def process_labels(self, data, column):
        """
        Converts labels to numerical values for the consistency objective

        Parameters
        ----------
        data : datasets.Dataset
            dataset for which labels need to be converted
        column : str
            The column on which to apply label conversion

        Returns
        -------
        datasets.Dataset
            The dataset with the labels converted

        """
        label_values = {'entailment': 1.0,
                        'neutral': 0.0,
                        'contradiction': -1.0}
        return data.align_labels_with_mapping(label_values,
                                              column)
|
| 90 |
+
|
| 91 |
+
|
qarac/models/QaracTrainerModel.py
CHANGED
|
@@ -53,13 +53,24 @@ class QuaracTrainerModel(keras.Model):
|
|
| 53 |
'conclusion_offset': tokenized text of conclusions for reasoning
|
| 54 |
objective, prefixed by '<s>'
|
| 55 |
'statement0': tokenized statement for consistency objective
|
|
|
|
| 56 |
training : Bool, optional
|
| 57 |
Not used. The default is None.
|
| 58 |
|
| 59 |
Returns
|
| 60 |
-------
|
| 61 |
-
results :
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
"""
|
| 65 |
results = {}
|
|
|
|
| 53 |
'conclusion_offset': tokenized text of conclusions for reasoning
|
| 54 |
objective, prefixed by '<s>'
|
| 55 |
'statement0': tokenized statement for consistency objective
|
| 56 |
+
'statement1': tokenized statement for consistency objective
|
| 57 |
training : Bool, optional
|
| 58 |
Not used. The default is None.
|
| 59 |
|
| 60 |
Returns
|
| 61 |
-------
|
| 62 |
+
results : dict[str,tensorflow.tensor]
|
| 63 |
+
Fields are
|
| 64 |
+
'encode_decode': tokenized text from decoding of vectors produced by
|
| 65 |
+
answer encoder from 'all_text'
|
| 66 |
+
'question_answering': difference between vector produced by question
|
| 67 |
+
encoder for 'question' and answer encoder for
|
| 68 |
+
'answer'
|
| 69 |
+
'reasoning': tokenised text produced by decoder from sum of vectors
|
| 70 |
+
produced by answer encoder for 'proposition0' and
|
| 71 |
+
'proposition1'
|
| 72 |
+
'consistency': cosine similarity of vectors produced by answer encoder
|
| 73 |
+
from 'statement0' and 'statement1'
|
| 74 |
|
| 75 |
"""
|
| 76 |
results = {}
|
qarac/utils/CoreferenceResolver.py
CHANGED
|
@@ -10,15 +10,51 @@ from allennlp.predictors.predictor import Predictor
|
|
| 10 |
import pandas
|
| 11 |
|
| 12 |
def clean(sentence):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
return sentence if sentence.strip().endswith('.') else sentence+'.'
|
| 14 |
|
| 15 |
class CoreferenceResolver(object):
|
| 16 |
|
| 17 |
def __init__(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
|
| 19 |
self.predictor = Predictor.from_path(model_url)
|
| 20 |
|
| 21 |
def __call__(self,group):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
tokenized = group.apply(clean).str.split()
|
| 23 |
line_breaks = tokenized.apply(len).cumsum()
|
| 24 |
doc = []
|
|
|
|
| 10 |
import pandas
|
| 11 |
|
| 12 |
def clean(sentence):
    """
    Ensure sentence ends with full stop

    Parameters
    ----------
    sentence : str
        Sentence to be cleaned

    Returns
    -------
    str
        Sentence with full stop at the end.

    """
    # Ignore surrounding whitespace when checking, but leave it intact.
    if sentence.strip().endswith('.'):
        return sentence
    return sentence + '.'
|
| 28 |
|
| 29 |
class CoreferenceResolver(object):
|
| 30 |
|
| 31 |
    def __init__(self):
        """
        Creates the Coreference resolver

        Returns
        -------
        None.

        """
        # NOTE(review): Predictor.from_path downloads a large SpanBERT model
        # archive on first use — requires network access; verify caching
        # behaviour in the deployment environment.
        model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
        self.predictor = Predictor.from_path(model_url)
|
| 42 |
|
| 43 |
def __call__(self,group):
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
Parameters
|
| 48 |
+
----------
|
| 49 |
+
group : pandas.Series
|
| 50 |
+
Sentences on which to perform coreference resolution
|
| 51 |
+
|
| 52 |
+
Returns
|
| 53 |
+
-------
|
| 54 |
+
pandas.Series
|
| 55 |
+
Sentences with coreferences resolved
|
| 56 |
+
|
| 57 |
+
"""
|
| 58 |
tokenized = group.apply(clean).str.split()
|
| 59 |
line_breaks = tokenized.apply(len).cumsum()
|
| 60 |
doc = []
|