Spaces:
Build error
Build error
PeteBleackley committed on
Commit ·
c106121
1
Parent(s): 985ef96
Data preparation for WikiQA
Browse files- scripts.py +21 -0
scripts.py
CHANGED
|
@@ -8,12 +8,33 @@ import qarac.corpora.Batcher
|
|
| 8 |
import qarac.models.qarac_base_model
|
| 9 |
import keras
|
| 10 |
import tensorflow
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def decoder_loss(y_true,y_pred):
    """Sparse categorical cross-entropy between target ids and decoder logits.

    y_true : integer class ids, passed straight through to Keras.
    y_pred : an object exposing a ``logits`` attribute
             (presumably a model output structure; confirm against callers).

    Returns whatever keras.losses.sparse_categorical_crossentropy returns.
    """
    # The Keras keyword is `from_logits`; the previous `logits=True` is not an
    # accepted argument and raises TypeError when the loss is evaluated.
    return keras.losses.sparse_categorical_crossentropy(y_true,
                                                        y_pred.logits,
                                                        from_logits=True)
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
def train_base_model(task,filename):
|
|
|
|
| 8 |
import qarac.models.qarac_base_model
|
| 9 |
import keras
|
| 10 |
import tensorflow
|
| 11 |
+
import spacy
|
| 12 |
+
import spacy_experimental
|
| 13 |
+
import pandas
|
| 14 |
|
| 15 |
def decoder_loss(y_true,y_pred):
    """Sparse categorical cross-entropy between target ids and decoder logits.

    y_true : integer class ids, passed straight through to Keras.
    y_pred : an object exposing a ``logits`` attribute
             (presumably a model output structure; confirm against callers).

    Returns whatever keras.losses.sparse_categorical_crossentropy returns.
    """
    # The Keras keyword is `from_logits`; the previous `logits=True` is not an
    # accepted argument and raises TypeError when the loss is evaluated.
    return keras.losses.sparse_categorical_crossentropy(y_true,
                                                        y_pred.logits,
                                                        from_logits=True)
|
| 19 |
|
| 20 |
+
def capitalise(token,i):
    """Return the token's text, trailing whitespace included, in normalised case.

    token : object with ``text_with_ws`` and ``tag_`` attributes.
    i     : position of the token within its document.

    The first token (i == 0) and any token whose tag starts with 'NNP' are
    title-cased; every other token is lower-cased.
    """
    surface = token.text_with_ws
    # `or` short-circuits, so the tag is only consulted for non-initial tokens.
    wants_capital = i==0 or token.tag_.startswith('NNP')
    if wants_capital:
        return surface.title()
    return surface.lower()
|
| 22 |
+
|
| 23 |
+
def clean_question(doc):
    """Rebuild a question string with normalised casing and a final '?'.

    doc : iterable of tokens accepted by ``capitalise`` (each token's
          ``text_with_ws`` is used, so the original spacing is preserved).

    Returns the re-cased text with trailing whitespace removed and exactly one
    '?' guaranteed at the end. An empty doc yields an empty string.
    """
    text = ''.join(capitalise(token,i) for (i,token) in enumerate(doc)).rstrip()
    # Nothing to punctuate; previously this case raised IndexError on words[-1].
    if not text:
        return ''
    # Test the stripped text rather than the last token's text_with_ws: a final
    # '? ' token would otherwise fail the comparison and gain a second '?',
    # and a trailing space would produce "word ?".
    if not text.endswith('?'):
        text += '?'
    return text
|
| 28 |
+
|
| 29 |
+
def prepare_wiki_qa(filename,outfilename):
    """Prepare WikiQA data: sentence-split answers and cleaned questions.

    filename    : path of a tab-separated input file; the columns
                  'DocumentID', 'Sentence', 'Question' and 'Label' are read.
    outfilename : path the resulting CSV is written to, with the columns
                  'Cleaned_questions', 'Resolved_answers' and 'Label'.
    """
    data = pandas.read_csv(filename,sep='\t')
    nlp = spacy.load('en_core_web_trf')
    nlp.add_pipe('experimental_coref')
    # Sentences belonging to the same document are joined so the pipeline sees
    # each document as one text.
    documents = data.groupby('DocumentID')['Sentence'].apply(lambda x: ' '.join(x))
    # spaCy exposes sentence spans as `Doc.sents` (there is no `Doc.sentences`).
    # The column is named 'Resolved_answers' to match the selection below; the
    # former 'Resolved_answer' spelling made that selection raise KeyError.
    # NOTE(review): groupby orders documents by DocumentID and spaCy re-segments
    # the joined text, so these sentences are not guaranteed to line up
    # one-to-one with the rows of `data` - confirm the alignment.
    # NOTE(review): sent.text is the original text; the coreference component's
    # output is not substituted into it here - confirm whether that is intended.
    data['Resolved_answers'] = pandas.Series([sent.text
                                              for doc in nlp.pipe(documents)
                                              for sent in doc.sents])
    # Iterating a DataFrame yields its column labels, so the question texts
    # must be passed explicitly.
    data['Cleaned_questions']=pandas.Series([clean_question(doc)
                                             for doc in nlp.pipe(data['Question'])])
    data[['Cleaned_questions','Resolved_answers','Label']].to_csv(outfilename)
|
| 38 |
|
| 39 |
|
| 40 |
def train_base_model(task,filename):
|