File size: 6,575 Bytes
c47de87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import collections
import numpy as np
import datasets
import json
import os
from typing import Optional, Tuple
from tqdm.auto import tqdm
# the train data file is expected to have the format of dataset SQUAD v2.0
def load_dataset(dataset_path, split=0.1, shuffle=True, seed=None):
    """Load a SQuAD-v2.0-style JSON file and split it into train/valid sets.

    Args:
        dataset_path: Path to a JSON file whose top-level "data" key holds a
            list of topics, each with "title" and "paragraphs" (SQuAD v2.0).
        split: Fraction of examples placed in the validation split (default 0.1).
        shuffle: If True (default), examples are shuffled before splitting.
        seed: Optional int. When given (and shuffle is True), seeds the
            permutation so the split is reproducible. Default None preserves
            the previous behavior (global numpy RNG state).

    Returns:
        datasets.DatasetDict with 'train' and 'valid' splits, each holding the
        columns: id, title, context, question, answers.
    """
    # Explicit encoding: SQuAD-style corpora are UTF-8; the platform default may not be.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        data = json.load(f)["data"]
    columns = {'id': [], 'title': [], 'context': [], 'question': [], 'answers': []}
    for topic in data:
        title = topic["title"]
        for paragraph in topic["paragraphs"]:
            for qas in paragraph['qas']:
                columns['id'].append(qas['id'])
                columns['title'].append(title)
                columns['context'].append(paragraph["context"])
                columns['question'].append(qas["question"])
                columns['answers'].append(qas["answers"])
    # There is no predefined train/validation split, so split manually.
    n_samples = len(columns['id'])
    if shuffle:
        if seed is not None:
            perms = np.random.default_rng(seed).permutation(n_samples)
        else:
            perms = np.random.permutation(n_samples)
    else:
        perms = list(range(n_samples))
    # First (1 - split) of the permutation is train, the remainder is valid.
    n_train = n_samples - int(split * n_samples)
    train_ds = {name: [assets[i] for i in perms[:n_train]]
                for name, assets in columns.items()}
    valid_ds = {name: [assets[i] for i in perms[n_train:]]
                for name, assets in columns.items()}
    raw_dataset = datasets.DatasetDict()
    raw_dataset['train'] = datasets.Dataset.from_dict(train_ds)
    raw_dataset['valid'] = datasets.Dataset.from_dict(valid_ds)
    return raw_dataset
def postprocess_qa_predictions(
    features,
    tokenizer,
    predictions: Tuple[np.ndarray, np.ndarray],
    n_best_size: int = 20,
    max_answer_length: int = 30
):
    '''
    Post-processes the predictions of a question-answering model to convert them to answers that are
    substrings of the original contexts. This is the base postprocessing function for models that only
    return start and end logits (SQuAD-v2.0 style, where an empty/"null" answer is allowed).

    Args:
        features: The processed dataset; each feature is a mapping with at least
            'input_ids' (the token ids fed to the model) and 'id' (example identifier).
        tokenizer: The tokenizer used to decode answer token ids back to text.
        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits
            respectively. Their first dimension must match the number of elements of :obj:`features`.
        n_best_size (:obj:`int`, `optional`, defaults to 20):
            The total number of n-best predictions to generate when looking for an answer.
        max_answer_length (:obj:`int`, `optional`, defaults to 30):
            The maximum length of an answer that can be generated. This is needed because the start and
            end predictions are not conditioned on one another.

    Returns:
        collections.OrderedDict mapping each feature's "id" to its predicted answer text
        ("" when the null prediction scores higher than the best non-null span).

    Raises:
        ValueError: if `predictions` is not a 2-tuple, or its first element's length
            does not match the number of features.
    '''
    if len(predictions) != 2:
        raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
    if len(predictions[0]) != len(features):
        raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")
    all_start_logits, all_end_logits = predictions
    # The dictionary we have to fill.
    all_predictions = collections.OrderedDict()
    # Loop over all the examples.
    for index, feature in enumerate(tqdm(features)):
        start_logits = all_start_logits[index]
        end_logits = all_end_logits[index]
        # Null (no-answer) prediction. NOTE(review): the null score is start_logits[1] +
        # end_logits[0] with span (1, 0); presumably tied to this project's special-token
        # layout — confirm against the tokenizer's CLS/SEP positions.
        min_null_prediction = {
            "ids": (1, 0),
            "score": start_logits[1] + end_logits[0],
        }
        null_score = min_null_prediction["score"]
        prelim_predictions = []
        # Go through all pairs among the `n_best_size` greatest start and end logits.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Don't consider out-of-scope answers (indices beyond the input).
                if (start_index >= len(feature['input_ids'])
                        or end_index >= len(feature['input_ids'])):
                    continue
                # Don't consider answers with a length that is either < 0 or > max_answer_length.
                if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue
                prelim_predictions.append({
                    "ids": (start_index, end_index),
                    "score": start_logits[start_index] + end_logits[end_index],
                })
        # Always include the null prediction among the candidates.
        prelim_predictions.append(min_null_prediction)
        # Only keep the best `n_best_size` candidates. (Named `nbest`, not `predictions`,
        # to avoid shadowing the function parameter of the same name.)
        nbest = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
        # Add back the null prediction if it was removed because of its low score.
        if not any(p["ids"] == (1, 0) for p in nbest):
            nbest.append(min_null_prediction)
        # Decode the highest-scoring non-null span, if any.
        best_non_null_pred = None
        for pred in nbest:
            l, r = pred.pop("ids")
            # The null span (1, 0) fails l <= r and is skipped here.
            if l <= r:
                pred_tokens = tokenizer.convert_ids_to_tokens(feature['input_ids'][l: r + 1])
                pred["text"] = tokenizer.convert_tokens_to_string(pred_tokens)
                best_non_null_pred = pred
                break
        # Emit "" (no answer) when the null score beats the best non-null span.
        if best_non_null_pred is None or best_non_null_pred["score"] < null_score:
            all_predictions[feature["id"]] = ""
        else:
            all_predictions[feature["id"]] = best_non_null_pred["text"]
    return all_predictions
|