| | from torch.utils.data import DataLoader, Dataset |
| | import torch |
| | from transformers import BertTokenizerFast, BertModel |
| | from transformers import BertConfig, BertPreTrainedModel |
| | import numpy as np |
| | from typing import Dict, List, Union, Tuple |
| | from utils import ner_labels_to_ids, intent_labels_to_ids, structure_data |
| |
|
class tokenized_dataset(Dataset):
    """
    A PyTorch Dataset that tokenizes text and encodes NER and intent labels for a BERT-based model.

    Args:
        dataset (dict): A dictionary containing 'text', 'intent', and 'entities' keys.
            'text' and 'intent' hold one string per sample; 'entities' holds, per sample,
            an iterable of NER label names (one per word — presumably whitespace-separated
            words; verify against the data preparation code).
        tokenizer (BertTokenizerFast): An already-instantiated fast tokenizer. It is called
            once per sample with ``return_offsets_mapping=True``.
        max_len (int, optional): Length every sequence is padded/truncated to (default: 128).

    Attributes:
        len (int): Number of samples in the dataset.

    Methods:
        __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
            Retrieve and preprocess a single sample from the dataset.

        __len__(self) -> int:
            Get the total number of samples in the dataset.

    Returns:
        Dict[str, torch.Tensor]: Every field produced by the tokenizer (as tensors) plus
        'ner_labels' and 'intent_labels'.
    """

    def __init__(self, dataset: Dict[str, List[str]], tokenizer: "BertTokenizerFast", max_len: int = 128):
        self.len = len(dataset['text'])
        # Label-name -> id lookups supplied by the project's utils module.
        self.ner_labels_to_ids = ner_labels_to_ids()
        self.intent_labels_to_ids = intent_labels_to_ids()
        self.text = dataset['text']
        self.intent = dataset['intent']
        self.ner = dataset['entities']
        # Store the tokenizer instance itself. Calling it here (with no text) would fail,
        # and __getitem__ needs a callable that accepts a sentence.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        """
        Tokenize sample ``index`` and align its NER labels to the produced tokens.

        Only the first token of each word receives the word's label id; continuation
        sub-tokens and special/padding tokens are set to -100.

        Raises:
            IndexError: If ``index`` is outside the dataset.
            KeyError: If a NER or intent label name is missing from the lookup tables.
            ValueError: If the sentence yields more words than there are NER labels.
        """
        sentence = self.text[index].strip()
        intent_label = self.intent[index].strip()
        ner_labels = self.ner[index]

        encoding = self.tokenizer(
            sentence,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )

        ner_label_ids = [self.ner_labels_to_ids[label] for label in ner_labels]
        offsets = encoding['offset_mapping']

        # -100 is the default ``ignore_index`` of torch.nn.CrossEntropyLoss; presumably the
        # training loss skips these positions — confirm against the training loop.
        encoded_ner_labels = np.full(len(offsets), -100, dtype=int)

        word_idx = 0
        prev_end = -1
        for token_idx, (start, end) in enumerate(offsets):
            # (0, 0) offsets mark special and padding tokens: leave them at -100.
            if start == 0 and end == 0:
                continue
            # A token that does not start exactly where the previous one ended begins a
            # new word; a token that does is a continuation piece and keeps -100.
            if start != prev_end:
                if word_idx >= len(ner_label_ids):
                    # Raise ValueError rather than letting an IndexError escape: an
                    # IndexError from __getitem__ silently ends plain ``for`` iteration.
                    raise ValueError(
                        f"Sample {index} has more words than NER labels "
                        f"({len(ner_label_ids)} labels): {sentence!r}"
                    )
                encoded_ner_labels[token_idx] = ner_label_ids[word_idx]
                word_idx += 1
            prev_end = end

        tokenized_intent_label = self.intent_labels_to_ids[intent_label]

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['ner_labels'] = torch.as_tensor(encoded_ner_labels)
        item['intent_labels'] = torch.as_tensor(tokenized_intent_label)

        return item

    def __len__(self) -> int:
        """Return the number of samples in the dataset."""
        return self.len
| | |
| |
|