|
|
import torch
from torch.utils.data import Dataset
|
|
def prepare_input(cfg, text, tokenizer):
    """
    Tokenize a single text, padding/truncating to ``cfg.MAX_LEN``.

    Returns the tokenizer output dict (keys such as "input_ids" and
    "attention_mask", plus "token_type_ids" for tokenizers that emit them),
    with each value converted to a 1-D ``torch.long`` tensor of length
    ``cfg.MAX_LEN``.

    :param cfg: configuration object with a ``MAX_LEN`` attribute
                (maximum sequence length).
    :param text: a single text string to tokenize.
    :param tokenizer: a HuggingFace-style tokenizer exposing ``encode_plus``.
    :return inputs: dict where values are ``torch.long`` tensors.
    """
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.MAX_LEN,
        padding='max_length',
        truncation=True,
    )
    # encode_plus with return_tensors=None yields plain Python lists;
    # convert each to a long tensor for the model.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs
|
|
|
|
def collate(inputs):
    """
    Truncate every batch tensor to the longest real (non-padding) sequence
    in the batch, as measured by the attention mask.

    :param inputs: dict of 2-D tensors of shape (batch, seq_len), including
                   an "attention_mask" entry where 1 marks real tokens and
                   0 marks padding.
    :return: the same dict, mutated in place, with every tensor sliced to
             (batch, mask_len) along dim 1.
    """
    # Longest non-padded sequence length across the batch.
    mask_len = int(inputs["attention_mask"].sum(dim=1).max())
    # Reassigning existing keys during iteration is safe (no keys added/removed).
    for k in inputs:
        inputs[k] = inputs[k][:, :mask_len]
    return inputs
|
|
|
|
class CustomDataset(Dataset):
    """
    PyTorch dataset over a dataframe of essays.

    Expects ``df`` to provide the columns ``full_text`` (the essay text),
    ``score`` (the integer label) and ``essay_id``.
    """

    def __init__(self, cfg, df, tokenizer):
        self.cfg = cfg
        self.tokenizer = tokenizer
        # Extract the underlying arrays from the dataframe once, up front,
        # so per-item access avoids pandas indexing overhead.
        self.texts = df['full_text'].values
        self.labels = df['score'].values
        self.essay_ids = df['essay_id'].values

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return tokenized inputs, label tensor and essay id for one row."""
        return {
            "inputs": prepare_input(self.cfg, self.texts[item], self.tokenizer),
            "labels": torch.tensor(self.labels[item], dtype=torch.long),
            "essay_ids": self.essay_ids[item],
        }
|
|