PeteBleackley committed on
Commit
c106121
·
1 Parent(s): 985ef96

Data preparation for WikiQA

Browse files
Files changed (1) hide show
  1. scripts.py +21 -0
scripts.py CHANGED
@@ -8,12 +8,33 @@ import qarac.corpora.Batcher
8
  import qarac.models.qarac_base_model
9
  import keras
10
  import tensorflow
 
 
 
11
 
12
def decoder_loss(y_true, y_pred):
    """Sparse categorical cross-entropy over the decoder's raw logits.

    Parameters
    ----------
    y_true : integer class-id targets
    y_pred : model output object carrying raw scores in `.logits`
        (e.g. a HuggingFace-style output) — TODO confirm against the model.

    Returns
    -------
    Per-element loss tensor from keras.
    """
    # BUG FIX: the keyword is `from_logits`, not `logits` — the original
    # call raises TypeError. `from_logits=True` is required because we pass
    # unnormalised scores rather than probabilities.
    return keras.losses.sparse_categorical_crossentropy(y_true,
                                                        y_pred.logits,
                                                        from_logits=True)
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  def train_base_model(task,filename):
 
8
  import qarac.models.qarac_base_model
9
  import keras
10
  import tensorflow
11
+ import spacy
12
+ import spacy_experimental
13
+ import pandas
14
 
15
def decoder_loss(y_true, y_pred):
    """Sparse categorical cross-entropy over the decoder's raw logits.

    Parameters
    ----------
    y_true : integer class-id targets
    y_pred : model output object carrying raw scores in `.logits`
        (e.g. a HuggingFace-style output) — TODO confirm against the model.

    Returns
    -------
    Per-element loss tensor from keras.
    """
    # BUG FIX: the keyword is `from_logits`, not `logits` — the original
    # call raises TypeError. `from_logits=True` is required because we pass
    # unnormalised scores rather than probabilities.
    return keras.losses.sparse_categorical_crossentropy(y_true,
                                                        y_pred.logits,
                                                        from_logits=True)
19
 
20
def capitalise(token, i):
    """Return the token's text (with trailing whitespace), title-cased when it
    opens the sentence (i == 0) or carries a proper-noun tag (NNP/NNPS),
    lower-cased otherwise.
    """
    text = token.text_with_ws
    if i == 0 or token.tag_.startswith('NNP'):
        return text.title()
    return text.lower()
22
+
23
def clean_question(doc):
    """Normalise a spaCy-parsed question's capitalisation and terminal '?'.

    Each token is passed through `capitalise` (sentence-initial and
    proper-noun tokens title-cased, everything else lower-cased), and a
    trailing '?' is appended if the last token isn't one already.

    Parameters
    ----------
    doc : iterable of spaCy tokens

    Returns
    -------
    str : the reassembled question text.
    """
    words = [capitalise(token, i) for (i, token) in enumerate(doc)]
    # BUG FIX: guard against an empty doc — `words[-1]` raised IndexError.
    if not words or words[-1] != '?':
        words.append('?')
    return ''.join(words)
28
+
29
def prepare_wiki_qa(filename, outfilename):
    """Prepare the WikiQA TSV for training: coreference-resolve the answer
    sentences per document and normalise the questions, then write the
    cleaned (question, answer, label) triples to `outfilename` as CSV.

    Parameters
    ----------
    filename : str — path to the tab-separated WikiQA file
        (assumes 'Question', 'Sentence', 'DocumentID', 'Label' columns —
        standard WikiQA schema; TODO confirm against the data release).
    outfilename : str — destination CSV path.
    """
    data = pandas.read_csv(filename, sep='\t')
    nlp = spacy.load('en_core_web_trf')
    nlp.add_pipe('experimental_coref')
    # Join each document's candidate sentences so coreference can resolve
    # across sentence boundaries, then split the resolved text back out.
    per_document = data.groupby('DocumentID')['Sentence'].apply(' '.join)
    # BUG FIX: spaCy Doc exposes `.sents`, not `.sentences`.
    # NOTE(review): this assumes re-segmentation yields one sentence per
    # original row so the Series aligns with `data` — verify on real data.
    data['Resolved_answers'] = pandas.Series([sent.text
                                              for doc in nlp.pipe(per_document)
                                              for sent in doc.sents])
    # BUG FIX: `nlp.pipe(data)` iterated the DataFrame, which yields column
    # labels — pipe the actual question texts instead.
    data['Cleaned_questions'] = pandas.Series([clean_question(doc)
                                               for doc in nlp.pipe(data['Question'])])
    # BUG FIX: the column was created as 'Resolved_answer' but selected as
    # 'Resolved_answers', raising KeyError — names now agree.
    data[['Cleaned_questions', 'Resolved_answers', 'Label']].to_csv(outfilename)
38
 
39
 
40
  def train_base_model(task,filename):