PeteBleackley committed on
Commit
9ca9d81
·
1 Parent(s): 5f8e115

Decided to use pandas rather than datasets

Browse files
qarac/corpora/CombinedCorpus.py CHANGED
@@ -32,42 +32,35 @@ class CombinedCorpus(keras.utils.Sequence):
32
 
33
  """
34
  super(CombinedCorpus,self).__init__()
35
- self.tokenizer = tokenizer
36
- start_doc = tokenizer.encode('<s>')
37
- end_doc = tokenizer.encode('</s>')
38
  self.all_text = CorpusLoader.CorpusLoader(kwargs['all_text'],
39
- start_doc,
40
- end_doc,
41
  ['all_text'],
42
  {'all_text':('offset_text',
43
  'encode_decode')})
44
  n_samples = len(self.all_text)
45
  self.n_batches = numpy.ceil(n_samples/32.0).astype(int)
46
  self.question_answering = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['question_answering'],
47
- start_doc,
48
- end_doc,
49
  ['question',
50
  'answer'],
51
  {}),
52
  n_samples)
53
  self.reasoning = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['reasoning'],
54
- start_doc,
55
- end_doc,
56
  ['proposition0',
57
  'proposition1'],
58
  {'conclusion':('conclusion_offset',
59
  'reasoning')}),
60
  n_samples)
61
  self.consistency = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['consitency'],
62
- start_doc,
63
- end_doc,
64
  ['statement0',
65
  'statement1'],
66
  {},
67
  'consistency'),
68
  n_samples)
69
  self.batches = []
70
- self.pad_token = self.tokenizer.token_to_id('<pad>')
71
  self.on_epoch_end()
72
 
73
  def __len__(self):
 
32
 
33
  """
34
  super(CombinedCorpus,self).__init__()
 
 
 
35
  self.all_text = CorpusLoader.CorpusLoader(kwargs['all_text'],
36
+ tokenizer,
 
37
  ['all_text'],
38
  {'all_text':('offset_text',
39
  'encode_decode')})
40
  n_samples = len(self.all_text)
41
  self.n_batches = numpy.ceil(n_samples/32.0).astype(int)
42
  self.question_answering = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['question_answering'],
43
+ tokenizer,
 
44
  ['question',
45
  'answer'],
46
  {}),
47
  n_samples)
48
  self.reasoning = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['reasoning'],
49
+ tokenizer,
 
50
  ['proposition0',
51
  'proposition1'],
52
  {'conclusion':('conclusion_offset',
53
  'reasoning')}),
54
  n_samples)
55
  self.consistency = CorpusRepeater.CorpusRepeater(CorpusLoader.CorpusLoader(kwargs['consitency'],
56
+ tokenizer,
 
57
  ['statement0',
58
  'statement1'],
59
  {},
60
  'consistency'),
61
  n_samples)
62
  self.batches = []
63
+ self.pad_token = tokenizer.token_to_id('<pad>')
64
  self.on_epoch_end()
65
 
66
  def __len__(self):
qarac/corpora/CorpusLoader.py CHANGED
@@ -6,14 +6,13 @@ Created on Wed Sep 20 07:48:54 2023
6
  @author: peter
7
  """
8
 
9
- import datasets
10
  import tokenizers
11
 
12
  class CorpusLoader(object):
13
 
14
  def __init__(self,path,
15
- start_doc,
16
- end_doc,
17
  text_inputs,
18
  text_outputs,
19
  label=None):
@@ -44,14 +43,22 @@ class CorpusLoader(object):
44
  None.
45
 
46
  """
47
- data = datasets.Dataset.from_file(path)
48
- self.n_rows = len(data)
49
- self.dataset = data.to_iterable_dataset()
50
- self.start_doc = start_doc
51
- self.end_doc = end_doc
52
  self.text_inputs = text_inputs
53
  self.text_outputs = text_outputs
54
  self.label = label
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  def __len__(self):
57
  """
@@ -77,7 +84,8 @@ class CorpusLoader(object):
77
  outputs for model
78
 
79
  """
80
- for row in self.dataset.shuffle():
 
81
  X={}
82
  Y={}
83
  for column in self.text_inputs:
 
6
  @author: peter
7
  """
8
 
9
+ import numpy
10
  import tokenizers
11
 
12
  class CorpusLoader(object):
13
 
14
  def __init__(self,path,
15
+ tokenizer
 
16
  text_inputs,
17
  text_outputs,
18
  label=None):
 
43
  None.
44
 
45
  """
46
+ data = pandas.read_csv(path)
47
+ self.n_rows = data.shape[0]
 
 
 
48
  self.text_inputs = text_inputs
49
  self.text_outputs = text_outputs
50
  self.label = label
51
+ self.rng = numpy.random.default_rng()
52
+ columns = list(set(self.text_inputs)|set(self.text_outputs.keys()))
53
+ tokenized = {column:tokenizer.encode_batch(data[column],
54
+ add_special_tokens=False)}
55
+ if self.label is not None:
56
+ tokenized[self.label] = data[self.label]
57
+ self.dataset = [{column:tokenized[column][i]
58
+ for column in columns}
59
+ for i in range(self.n_rows)]
60
+ self.start_doc = tokenizer.encode('<s>')
61
+ self.end_doc = tokenizer.encode('</s>')
62
 
63
  def __len__(self):
64
  """
 
84
  outputs for model
85
 
86
  """
87
+ self.rng.shuffle(self.dataset)
88
+ for row in self.dataset:
89
  X={}
90
  Y={}
91
  for column in self.text_inputs: