botcon committed on
Commit
8838f38
1 Parent(s): 1455a6e

Upload QuestionAnswering.py

Browse files
Files changed (1) hide show
  1. QuestionAnswering.py +520 -0
QuestionAnswering.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import LukePreTrainedModel, LukeModel, AutoTokenizer, TrainingArguments, default_data_collator, Trainer, AutoModelForQuestionAnswering
2
+ from transformers.modeling_outputs import ModelOutput
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ import evaluate
8
+ import torch
9
+ from dataclasses import dataclass
10
+ from datasets import load_dataset, concatenate_datasets
11
+ from torch import nn
12
+ from torch.nn import CrossEntropyLoss
13
+ import collections
14
+ import re
15
+
16
# ---- Run switches -----------------------------------------------------------
PEFT = False  # when training, wrap the model with LoRA adapters first
tf32 = True  # value written to the torch TF32 backend flags below
fp16 = True  # forwarded to TrainingArguments(fp16=...)
train = False  # run the fine-tuning section of the script
test = True  # run the per-question-word evaluation section of the script
trained_model = "LUKE_squad_finetuned_qa_tf32"  # first argument given to TrainingArguments
train_checkpoint = None  # passed to trainer.train(); None means no checkpoint is given

# ---- Evaluation inputs ------------------------------------------------------
# tokenizer_list[i] is loaded together with model_list[i] in the test section.
tokenizer_list = ["xlnet-base-cased", "roberta-base"]
model_list = ["XLNET_squad_finetuned_qa_tf32", "LUKE_squad_finetuned_qa_tf32"]
question_list = ["who", "what", "where", "when", "which", "how", "whom"]

# ---- Base checkpoints used for training -------------------------------------
base_tokenizer = "roberta-base"
base_model = "studio-ousia/luke-base"

# Alternative: XLNet
# base_tokenizer = "xlnet-base-cased"
# base_model = "xlnet-base-cased"

# Alternative: SpanBERT
# base_tokenizer = "bert-base-cased"
# base_model = "SpanBERT/spanbert-base-cased"

# ---- Torch backend setup ----------------------------------------------------
torch.backends.cuda.matmul.allow_tf32 = tf32
torch.backends.cudnn.allow_tf32 = tf32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
41
+
42
+ # https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/luke/modeling_luke.py#L319-L353
43
+ # Taken from HF repository, easier to include additional features -- Currently identical to LukeForQuestionAnswering by HF
44
+
45
@dataclass
class LukeQuestionAnsweringModelOutput(ModelOutput):
    """
    Container for the outputs of a LUKE question-answering model.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Span-extraction loss, combining the cross-entropy of the start
            positions with that of the end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Scores for each token being the start of the answer span (pre-softmax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Scores for each token being the end of the answer span (pre-softmax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            One tensor of shape `(batch_size, sequence_length, hidden_size)` per
            layer output, plus one for the embedding output when the model has
            an embedding layer.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            One tensor of shape `(batch_size, entity_length, hidden_size)` for
            the entity embedding output and one per layer output.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            One tensor of shape `(batch_size, num_heads, sequence_length,
            sequence_length)` per layer: the attention weights after the
            attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_logits: torch.FloatTensor = None
    end_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    entity_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
84
+
85
class AugmentedLukeForQuestionAnswering(LukePreTrainedModel):
    """LUKE encoder topped with a linear span-prediction head for extractive QA.

    The head maps every token's final hidden state to ``config.num_labels``
    logits, which are split into start-of-span and end-of-span scores. The
    class is kept separate from the stock Hugging Face implementation so that
    extra features can be added on top of the encoder output.
    """

    def __init__(self, config):
        super().__init__(config)

        # This is 2: one logit for the span start, one for the span end.
        self.num_labels = config.num_labels

        self.luke = LukeModel(config, add_pooling_layer=False)

        # Any improvements to the model (additional features, extra layers, ...)
        # are expected to be added here.
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.FloatTensor] = None,
        entity_ids: Optional[torch.LongTensor] = None,
        entity_attention_mask: Optional[torch.FloatTensor] = None,
        entity_token_type_ids: Optional[torch.LongTensor] = None,
        entity_position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, LukeQuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.

        Returns a `LukeQuestionAnsweringModelOutput` when `return_dict` resolves
        to true; otherwise a tuple of the non-`None` values among (loss,
        start_logits, end_logits, hidden_states, entity_hidden_states,
        attentions), in that order. The label tensors passed in are never
        modified.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The encoder is always asked for a dict so its fields can be read by
        # name; `return_dict` only governs what this method returns.
        outputs = self.luke(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            entity_ids=entity_ids,
            entity_attention_mask=entity_attention_mask,
            entity_token_type_ids=entity_token_type_ids,
            entity_position_ids=entity_position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        sequence_output = outputs.last_hidden_state

        # Split the per-token logits into separate start and end score vectors.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs;
            # they are clamped onto `ignored_index`, which the loss skips.
            # The clamp is out-of-place so the caller's label tensors (which a
            # squeezed view would alias) are left untouched.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            return tuple(
                v
                for v in [
                    total_loss,
                    start_logits,
                    end_logits,
                    outputs.hidden_states,
                    outputs.entity_hidden_states,
                    outputs.attentions,
                ]
                if v is not None
            )

        return LukeQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            entity_hidden_states=outputs.entity_hidden_states,
            attentions=outputs.attentions,
        )
202
+
203
+ # Get data to train model - squadshift is designed as a validation/testing set, so there are multiple answers, take the shortest
204
def get_squadshifts_training():
    """Build a training set from the SQuADShifts test splits.

    Loads the "test" split of the ``new_wiki``, ``nyt`` and ``reddit``
    configurations, concatenates them in that order, and maps
    ``validation_to_train`` over the result so each example keeps a single
    answer.
    """
    domains = ("new_wiki", "nyt", "reddit")
    splits = [load_dataset("squadshifts", domain)["test"] for domain in domains]
    return concatenate_datasets(splits).map(validation_to_train)
211
+
212
def validation_to_train(example):
    """Reduce a multi-answer example to its single shortest answer.

    ``example["answers"]`` is expected to hold the parallel lists ``"text"``
    and ``"answer_start"``. Both are trimmed, in place, to the one entry whose
    text is shortest (the first such entry on ties).

    An example with no answers is returned unchanged rather than raising.

    Returns:
        The same ``example`` object, with its answer lists trimmed.
    """
    answers = example["answers"]
    texts = answers["text"]
    if not texts:
        # min() over an empty range would raise ValueError; nothing to trim.
        return example

    shortest = min(range(len(texts)), key=lambda i: len(texts[i]))
    # Slice (rather than index) so both fields stay lists.
    answers["text"] = texts[shortest : shortest + 1]
    answers["answer_start"] = answers["answer_start"][shortest : shortest + 1]
    return example
219
+
220
+ # Get subset with specific question word
221
def get_dataset(dataset, pattern):
    """Return the rows of ``dataset`` whose question contains ``pattern`` as a whole word.

    Args:
        dataset: Object exposing ``filter(callable)``, where the callable
            receives a row with a ``"question"`` string.
        pattern: Regular-expression fragment (e.g. a question word) that is
            wrapped in ``\\b`` word boundaries and matched case-insensitively.

    Returns:
        Whatever ``dataset.filter`` returns for the matching rows.
    """
    # Compile once up front so the per-row callback only performs the search.
    matcher = re.compile(r"\b{}\b".format(pattern), flags=re.IGNORECASE)
    return dataset.filter(lambda x: bool(matcher.search(x["question"])))
223
+
224
if __name__ == "__main__":
    # RoBERTa's fast tokenizer stands in for the LUKE tokenizer: both share the
    # same subword vocabulary, and LUKE's entity inputs are not used here.
    tokenizer = AutoTokenizer.from_pretrained(base_tokenizer)

    # Tokenization / decoding settings shared by the helpers below.
    max_length = 500  # max_length handed to the tokenizer for each feature
    stride = 128  # stride handed to the tokenizer for overflowing contexts
    batch_size = 8  # per-device train and eval batch size
    n_best = 20  # top start / end candidates examined per feature
    max_answer_length = 30  # longest allowed predicted span, in tokens
    metric = evaluate.load("squad")
    raw_datasets = load_dataset("squad")

    raw_train = raw_datasets["train"]
    raw_validation = raw_datasets["validation"]

    def compute_metrics(start_logits, end_logits, features, examples):
        """Decode the best answer span per example and score it with ``metric``.

        Args:
            start_logits: Per-feature start scores, indexable by feature index.
            end_logits: Per-feature end scores, indexable by feature index.
            features: Tokenized features carrying ``"example_id"`` and an
                ``"offset_mapping"`` whose non-context entries are ``None``.
            examples: Raw examples carrying ``"id"``, ``"context"`` and
                ``"answers"``.

        Returns:
            The result of ``metric.compute`` on the predictions and references.
        """
        # Group feature indices by the example they were cut from.
        example_to_features = collections.defaultdict(list)
        for idx, feature in enumerate(features):
            example_to_features[feature["example_id"]].append(idx)

        predicted_answers = []
        for example in tqdm(examples):
            example_id = example["id"]
            context = example["context"]
            answers = []

            # Loop through all features associated with that example
            for feature_index in example_to_features[example_id]:
                start_logit = start_logits[feature_index]
                end_logit = end_logits[feature_index]
                offsets = features[feature_index]["offset_mapping"]

                # Indices of the n_best highest logits, best first.
                start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
                end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
                for start_index in start_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None:
                        continue
                    for end_index in end_indexes:
                        if offsets[end_index] is None:
                            continue
                        # Skip answers with a length that is either < 0 or > max_answer_length
                        if (
                            end_index < start_index
                            or end_index - start_index + 1 > max_answer_length
                        ):
                            continue

                        answers.append(
                            {
                                "text": context[offsets[start_index][0] : offsets[end_index][1]],
                                "logit_score": start_logit[start_index] + end_logit[end_index],
                            }
                        )

            # Select the answer with the best score; fall back to an empty
            # prediction when no candidate span was valid.
            if answers:
                best_answer = max(answers, key=lambda x: x["logit_score"])
                predicted_answers.append(
                    {"id": example_id, "prediction_text": best_answer["text"]}
                )
            else:
                predicted_answers.append({"id": example_id, "prediction_text": ""})

        theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
        return metric.compute(predictions=predicted_answers, references=theoretical_answers)

    def preprocess_training_examples(examples):
        """Tokenize a batch of examples and attach start/end token labels.

        Each feature produced by the tokenizer gets the token indices of the
        first answer of its source example. Features whose context window does
        not fully contain that answer, and examples that have no answer at
        all, are labelled ``(0, 0)``.

        Returns:
            The tokenizer output with ``"start_positions"`` and
            ``"end_positions"`` added (``"offset_mapping"`` and
            ``"overflow_to_sample_mapping"`` are removed).
        """
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=max_length,
            truncation="only_second",
            stride=stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs.pop("offset_mapping")
        sample_map = inputs.pop("overflow_to_sample_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[sample_map[i]]
            if not answer["text"]:
                # No answer to locate: indexing [0] would raise IndexError.
                start_positions.append(0)
                end_positions.append(0)
                continue

            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            # Bounds check guards the case where the context runs to the last token.
            while idx < len(sequence_ids) and sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label is (0, 0)
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    def preprocess_validation_examples(examples):
        """Tokenize a batch of examples for evaluation.

        Returns:
            The tokenizer output with an ``"example_id"`` entry per feature and
            an ``"offset_mapping"`` in which every non-context position has
            been replaced by ``None``.
        """
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=max_length,
            truncation="only_second",
            stride=stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        sample_map = inputs.pop("overflow_to_sample_mapping")
        example_ids = []

        for i, sample_idx in enumerate(sample_map):
            example_ids.append(examples["id"][sample_idx])

            # Keep offsets only for context tokens so decoding can reject
            # spans that touch the question or special tokens.
            sequence_ids = inputs.sequence_ids(i)
            offset = inputs["offset_mapping"][i]
            inputs["offset_mapping"][i] = [
                o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
            ]

        inputs["example_id"] = example_ids
        return inputs

    if train:
        model = AutoModelForQuestionAnswering.from_pretrained(base_model).to(device)

        # For squadshift
        raw_train = get_squadshifts_training()

        train_dataset = raw_train.map(
            preprocess_training_examples,
            batched=True,
            remove_columns=raw_train.column_names,
        )

        validation_dataset = raw_validation.map(
            preprocess_validation_examples,
            batched=True,
            remove_columns=raw_validation.column_names,
        )

        # --------------- PEFT -------------------- #
        # One epoch without PEFT took about 2h on the author's CUDA machine;
        # results with PEFT were noticeably worse.
        if PEFT:
            from peft import get_peft_model, LoraConfig, TaskType

            # ---- For all linear layers ----
            # Collect the attribute names of every Linear layer from the
            # model's printed structure.
            pattern = r'\((\w+)\): Linear'
            linear_layers = re.findall(pattern, str(model.modules))
            target_modules = list(set(linear_layers))

            # If using peft, can consider increasing r for better performance
            peft_config = LoraConfig(
                task_type=TaskType.QUESTION_ANS,
                inference_mode=False,
                r=8,
                lora_alpha=32,
                lora_dropout=0.1,
                target_modules=target_modules,
                bias='all',
            )

            model = get_peft_model(model, peft_config)
            model.print_trainable_parameters()

            trained_model += "_PEFT"

        # ------------------------------------------ #

        args = TrainingArguments(
            trained_model,
            evaluation_strategy="no",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=3,
            weight_decay=0.01,
            push_to_hub=True,
            fp16=fp16,
        )

        trainer = Trainer(
            model,
            args,
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
            data_collator=default_data_collator,
            tokenizer=tokenizer,
        )

        trainer.train(resume_from_checkpoint=train_checkpoint)

    if test:
        out = "out.txt"
        validation_size = 100  # examples evaluated per forward pass
        for tokenizer_name, model_name in zip(tokenizer_list, model_list):
            model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)
            # Rebinding `tokenizer` here is what makes
            # preprocess_validation_examples use the matching tokenizer.
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
            # Normal case
            # test_validation = raw_validation
            for question in question_list:
                test_validation = get_dataset(raw_validation, question)
                total = len(test_validation)
                exact_match = 0
                f1 = 0

                with torch.no_grad():
                    for start in range(0, total, validation_size):
                        small_eval_set = test_validation.select(
                            range(start, min(start + validation_size, total))
                        )
                        eval_set = small_eval_set.map(
                            preprocess_validation_examples,
                            batched=True,
                            remove_columns=test_validation.column_names,
                        )
                        eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
                        eval_set_for_model.set_format("torch")
                        batch = {
                            k: eval_set_for_model[k].to(device)
                            for k in eval_set_for_model.column_names
                        }
                        outputs = model(**batch)
                        start_logits = outputs.start_logits.cpu().numpy()
                        end_logits = outputs.end_logits.cpu().numpy()
                        res = compute_metrics(start_logits, end_logits, eval_set, small_eval_set)
                        # Weight each chunk by its share of the subset so the
                        # running sums end up as subset-wide averages.
                        weight = len(small_eval_set) / total
                        exact_match += res['exact_match'] * weight
                        f1 += res["f1"] * weight

                print("F1 score: {}".format(f1))
                print("Exact match: {}".format(exact_match))
                with open(out, "a", encoding="utf-8") as file:
                    file.write("Model: {}, Question: {}, Size: {}".format(model_name, question, total))
                    file.write("\n")
                    file.write("F1 score: {}".format(f1))
                    file.write("\n")
                    file.write("Exact match: {}".format(exact_match))
                    file.write("\n")
485
+
486
+ # LUKE
487
+ # F1 score: 92.4
488
+ # EM: 85.9
489
+
490
+ # XLNET
491
+ # F1 score: 91.54154256653278
492
+ # Exact match: 84.86666666666666
493
+
494
+ # SpanBERT
495
+ # F1 score: 92.160285362531
496
+ # Exact match: 85.73333333333333
497
+
498
+ # LUKE SQUADSHIFT (SQUAD then SQUADSHIFT)
499
+ # F1 score: 91.27683543983473
500
+ # Exact match: 84.96190476190473
501
+
502
+ # LUKE SQUAD on WHO question only
503
+ # F1 score: 95.10756796200876
504
+ # Exact match: 92.03125
505
+
506
+ # LUKE SQUAD on WHICH question only
507
+ # F1 score: 92.40873428373428
508
+ # Exact match: 87.43243243243242
509
+
510
+ # LUKE SQUAD on WHAT question only
511
+ # F1 score: 92.09871080377772
512
+ # Exact match: 85.56105610561056
513
+
514
+ # LUKE SQUAD on WHERE question only
515
+ # F1 score: 90.1197551009935
516
+ # Exact match: 82.8
517
+
518
+ # LUKE SQUAD on HOW question only
519
+ # F1 score: 91.29310175269578
520
+ # Exact match: 82.09677419354838