# Intended Use
## Data Preparation
### 1. Load the datasets
Load `train.jsonl`, `validation.jsonl`, and `test.jsonl` splits.
```python
# Load all splits (train/validation/test) of the HASD dataset from the
# Hugging Face Hub. `load_dataset` is assumed to come from the `datasets`
# library — confirm the import cell has run first.
raw_datasets = load_dataset("andreaceto/hasd")
```
---
### 2. Create Label Mappings
Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme.
```python
# --- Intent label <-> id tables ---
# Collect the distinct intents from the training split; sort so the id
# assignment is deterministic across runs.
intent_labels = sorted(raw_datasets['train'].unique('intent'))
id2intent = dict(enumerate(intent_labels))
intent2id = {name: idx for idx, name in id2intent.items()}
print(f"Intent mapping (intent2id): {intent2id}\n")

# --- BIO tag <-> id tables for the NER head ---
entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]
# "O" marks tokens outside any entity; each entity type contributes a
# B- (beginning) and an I- (inside) tag, in that order.
ner_tags = ["O"] + [
    f"{prefix}-{name}" for name in entity_labels for prefix in ("B", "I")
]
id2ner = dict(enumerate(ner_tags))
ner2id = {tag: idx for idx, tag in id2ner.items()}
print(f"NER mapping (ner2id): {ner2id}")
```
---
### 3. Preprocessing function
This is the core function. It takes a single data example and does two things:
1. Tokenizes the text.
2. Aligns character-based entity spans (`start`, `end`) with the new wordpiece tokens, assigning the correct BIO tag ID to each token.
```python
def preprocess_function(examples):
    """
    Tokenize a batch of examples and align character-span entity
    annotations to wordpiece tokens as BIO tag ids.

    Expects batched columns: 'text' (str), 'intent' (str), and 'entities'
    (list of dicts with 'start', 'end', 'label').  Relies on the
    module-level `tokenizer`, `intent2id`, and `ner2id`.

    Returns the tokenizer output augmented with:
      - 'intent_label': integer intent id per example
      - 'labels': per-token BIO tag ids per example
    """
    # --- Intent processing: map intent strings to integer ids ---
    intent_ids = [intent2id[intent] for intent in examples['intent']]

    # --- Tokenization ---
    # Padding is left to the data collator / Trainer; offsets are needed
    # below for span alignment and removed afterwards.
    tokenized_inputs = tokenizer(
        examples['text'],
        truncation=True,
        is_split_into_words=False,
        return_offsets_mapping=True,
    )

    # --- Entity (NER) label alignment ---
    ner_labels = []
    for i, entities in enumerate(examples['entities']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        offsets = tokenized_inputs['offset_mapping'][i]
        # Start with every token labeled 'O' (outside any entity).
        label_ids = [ner2id["O"]] * len(word_ids)
        for entity in entities:
            start_char, end_char, label = entity['start'], entity['end'], entity['label']
            # BUG FIX: the original checked `label_ids[j] == ner2id["O"]` to
            # decide between B- and I-, but every not-yet-tagged token is 'O',
            # so all tokens of a multi-token entity received B- tags
            # (B- B- B- instead of B- I- I-).  Track explicitly whether the
            # B- tag for THIS entity has been emitted.
            began = False
            for j, word_id in enumerate(word_ids):
                if word_id is None:
                    continue  # special tokens ([CLS]/[SEP]) stay 'O'
                span = offsets[j]
                if span is None:
                    continue
                token_start, token_end = span
                # Token overlaps the entity's character span.
                if start_char < token_end and end_char > token_start:
                    if not began:
                        label_ids[j] = ner2id[f"B-{label}"]
                        began = True
                    else:
                        label_ids[j] = ner2id[f"I-{label}"]
        ner_labels.append(label_ids)

    # Attach the processed labels and drop the offsets (training doesn't need them).
    tokenized_inputs["intent_label"] = intent_ids
    tokenized_inputs["labels"] = ner_labels
    tokenized_inputs.pop("offset_mapping", None)
    return tokenized_inputs
```
---
### 4. Apply Preprocessing and Save
Now we apply this function to our entire dataset and save the final, processed version.
```python
# Apply the function to all splits of the dataset
# Run the preprocessing over every split, dropping the raw columns so only
# the model inputs/labels remain.
processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

# Describe the processed columns so the label names travel with the dataset.
intent_feature = ClassLabel(names=list(intent2id.keys()))
ner_feature = Sequence(ClassLabel(names=list(ner2id.keys())))
features = Features({
    'input_ids': Sequence(Value('int64')),
    'attention_mask': Sequence(Value('int8')),
    'intent_label': intent_feature,
    'labels': ner_feature,
})

# Cast so the ClassLabel names are attached to the processed datasets.
processed_datasets = processed_datasets.cast(features)
```
These first four steps are essential for model training/fine-tuning.
For model inference you will need to execute the same steps on new input text.
---
## Multitask Model
### 1. Multitask Model class
To use the model you will need to define a `multitask_model.py` with the custom model class built upon our base model.
```python
from transformers import AutoModel, PreTrainedModel
import torch.nn as nn
class MultitaskModel(PreTrainedModel):
    """
    A Transformer encoder with two task heads sharing one backbone:

    * an intent-classification head fed by the [CLS] token, and
    * a token-classification (NER) head fed by every token position.

    Args:
        config: backbone config (DistilBERT-style — must expose ``dim``
            and ``seq_classif_dropout``).
        num_intent_labels: size of the intent label set.
        num_ner_labels: size of the BIO tag set.
    """

    def __init__(self, config, num_intent_labels: int, num_ner_labels: int):
        super().__init__(config)
        self.num_intent_labels = num_intent_labels
        self.num_ner_labels = num_ner_labels
        # Shared encoder built from the config alone (randomly initialized;
        # pretrained weights are loaded separately, e.g. via from_pretrained).
        self.transformer = AutoModel.from_config(config)
        # 1. Intent classification head: small MLP over the [CLS] vector.
        self.intent_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),  # smooth activation, standard in Transformer heads
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_intent_labels),
        )
        # 2. NER head: same shape, applied position-wise to every token.
        self.ner_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_ner_labels),
        )
        # Dropout applied to encoder outputs before each head.
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        intent_label=None,  # intent targets; enables the intent loss
        labels=None,        # per-token BIO targets; enables the NER loss
    ):
        """Run the encoder and both heads.

        Returns a dict with keys ``loss``, ``intent_logits``, and
        ``ner_logits``.  ``loss`` is 0 when no targets are supplied
        (pure inference), matching the original behavior.
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Shape: (batch_size, sequence_length, hidden_size)
        sequence_output = outputs.last_hidden_state

        # --- Intent logits: classify from the [CLS] (first) token ---
        cls_token_output = self.dropout(sequence_output[:, 0, :])
        intent_logits = self.intent_classifier(cls_token_output)

        # --- NER logits: classify every token position ---
        ner_logits = self.ner_classifier(self.dropout(sequence_output))

        # --- Combined loss ---
        # FIX: the original computed a loss only when BOTH target tensors
        # were present.  Each task now contributes its loss independently,
        # so the model also trains/evaluates correctly when only one task's
        # labels are provided.  Behavior with both labels is unchanged.
        total_loss = 0
        loss_fct = nn.CrossEntropyLoss()  # default ignore_index=-100 skips padding
        if intent_label is not None:
            total_loss = total_loss + loss_fct(
                intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1)
            )
        if labels is not None:
            total_loss = total_loss + loss_fct(
                ner_logits.view(-1, self.num_ner_labels), labels.view(-1)
            )

        return {
            "loss": total_loss,
            "intent_logits": intent_logits,
            "ner_logits": ner_logits,
        }
```
---
### 2. Load Tokenizer
```python
# Checkpoint the multitask model is built on; reused below when creating the config.
model_name = "distilbert-base-uncased"
# NOTE(review): AutoTokenizer must already be imported (from transformers).
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
---
### 3. Custom Metrics Function
This function is essential for a multitask model. It will be called by the Trainer at the end of each epoch to calculate both intent accuracy and NER F1-score.
```python
def compute_metrics(eval_pred):
    """
    Trainer metrics callback for the multitask model.

    Computes intent accuracy / weighted F1 from the sequence-level
    predictions, and an entity-level NER F1 (seqeval, strict IOB2) from
    the token-level predictions after dropping -100 padding positions.
    """
    predictions, label_values = eval_pred
    intent_preds, ner_preds = predictions
    # NOTE(review): this unpacking order must mirror the Trainer's
    # `label_names` ordering — confirm it yields (intent_label, labels).
    intent_labels, ner_labels = label_values

    # --- Intent metrics (argmax over class logits) ---
    intent_preds = np.argmax(intent_preds, axis=1)
    intent_accuracy = accuracy_score(intent_labels, intent_preds)
    intent_f1 = f1_score(intent_labels, intent_preds, average='weighted')

    # --- NER metrics ---
    ner_preds = np.argmax(ner_preds, axis=2)
    # Index -> tag-name list.  Renamed from `id2ner`: the original shadowed
    # the id->tag *dict* of the same name defined elsewhere with a plain list.
    ner_tag_names = processed_datasets['train'].features['labels'].feature.names
    # Drop padding positions (label -100) and map ids back to tag strings.
    true_ner_labels = []
    true_ner_predictions = []
    for pred_row, label_row in zip(ner_preds, ner_labels):
        kept = [
            (ner_tag_names[lab], ner_tag_names[pred])
            for pred, lab in zip(pred_row, label_row)
            if lab != -100
        ]
        true_ner_labels.append([lab for lab, _ in kept])
        true_ner_predictions.append([pred for _, pred in kept])
    ner_f1 = ner_f1_score(true_ner_labels, true_ner_predictions, mode='strict', scheme=IOB2)

    return {
        "intent_accuracy": intent_accuracy,
        "intent_f1": intent_f1,
        "ner_f1": ner_f1,
    }
```
---
### 4. Instantiate the model
We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.
```python
# Recover the label vocabularies from the processed dataset's features, so
# the config stays consistent with what the data actually encodes.
id2intent = processed_datasets['train'].features['intent_label'].names
id2ner = processed_datasets['train'].features['labels'].feature.names
intent2id = {name: idx for idx, name in enumerate(id2intent)}
ner2id = {tag: idx for idx, tag in enumerate(id2ner)}

# Base config for the checkpoint, extended with our custom label-mapping
# fields for both heads.
config = AutoConfig.from_pretrained(
    model_name,
    id2label_intent=id2intent,
    label2id_intent=intent2id,
    id2label_ner=id2ner,
    label2id_ner=ner2id,
)

# Build the multitask model around that config, sizing each head from its
# label vocabulary.
model = MultitaskModel(config, num_intent_labels=len(id2intent), num_ner_labels=len(id2ner))
```
|