File size: 10,396 Bytes
ba7da8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2dc819c
 
 
ba7da8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2dc819c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# Intended Use

## Data Preparation

### 1. Load the datasets
Load `train.jsonl`, `validation.jsonl`, and `test.jsonl` splits.

```python
# Fetch all splits (train/validation/test) of the HASD dataset from the Hugging Face Hub.
raw_datasets = load_dataset("andreaceto/hasd")
```

---

### 2. Create Label Mappings
Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme.

```python
# --- Create Intent Label Mappings ---
# Collect the unique intent strings from the training split; sorting makes
# the id assignment deterministic across runs.
intent_labels = sorted(raw_datasets['train'].unique('intent'))
id2intent = dict(enumerate(intent_labels))
intent2id = {name: idx for idx, name in id2intent.items()}
print(f"Intent mapping (intent2id): {intent2id}\n")


# --- Create Entity (NER) Label Mappings in BIO format ---
# The entity types present in the dataset.
entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]
# "O" marks tokens outside any entity; each entity type then gets a
# B- (beginning) and an I- (inside) variant.
ner_tags = ["O"] + [f"{prefix}-{label}" for label in entity_labels for prefix in ("B", "I")]

id2ner = dict(enumerate(ner_tags))
ner2id = {tag: idx for idx, tag in id2ner.items()}
print(f"NER mapping (ner2id): {ner2id}")
```

---

### 3. Preprocessing function
This is the core function. It takes a single data example and does two things:
1. Tokenizes the text.
2. Aligns character-based entity spans (`start`, `end`) with the new wordpiece tokens, assigning the correct BIO tag ID to each token.

```python
def preprocess_function(examples):
    """Tokenize a batch and align intent/entity labels with the wordpieces.

    For each example this:
      1. maps the intent string to its integer id,
      2. tokenizes the text (keeping character offsets),
      3. converts character-based entity spans into per-token BIO tag ids.

    Special tokens ([CLS]/[SEP]) keep the "O" tag so the label ids remain
    castable to a ClassLabel feature; sequence padding (and its -100 labels)
    is expected to be handled later by the data collator.
    """
    # --- Intent Processing ---
    # Convert intent strings to integer IDs.
    intent_ids = [intent2id[intent] for intent in examples['intent']]

    # --- Tokenization ---
    # Padding is deferred to the data collator / Trainer; offsets are needed
    # below for span alignment and dropped before returning.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)

    # --- Entity (NER) Label Alignment ---
    ner_labels = []
    for i, entities in enumerate(examples['entities']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        # Start with all tokens labeled as 'O' (Outside).
        label_ids = [ner2id["O"]] * len(word_ids)

        for entity in entities:
            start_char, end_char, label = entity['start'], entity['end'], entity['label']
            # BUGFIX: previously the code assigned 'B-' to ANY overlapping
            # token that was still 'O', so every token of a multi-token
            # entity got a 'B-' tag and 'I-' was never produced. Track the
            # entity's first token explicitly instead.
            inside_entity = False

            for j, word_id in enumerate(word_ids):
                if word_id is None:
                    # Special token ([CLS]/[SEP]) — never part of an entity.
                    continue

                # Character span covered by this token.
                token_start, token_end = tokenized_inputs['offset_mapping'][i][j]

                # Does the token overlap the entity's character span?
                if start_char < token_end and end_char > token_start:
                    if not inside_entity:
                        # First token of the entity gets the 'B-' tag.
                        label_ids[j] = ner2id[f"B-{label}"]
                        inside_entity = True
                    else:
                        # Subsequent tokens of the same entity get 'I-'.
                        label_ids[j] = ner2id[f"I-{label}"]

        ner_labels.append(label_ids)

    # Attach the aligned labels and drop the offsets (not a model input).
    tokenized_inputs["intent_label"] = intent_ids
    tokenized_inputs["labels"] = ner_labels
    tokenized_inputs.pop("offset_mapping", None)

    return tokenized_inputs
```

---

### 4. Apply Preprocessing and Save
Now we apply this function to our entire dataset and save the final, processed version.

```python
# Run the preprocessing over every split; the raw columns are dropped since
# the model only consumes the tokenized fields.
processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

# Describe the processed schema explicitly so the label names travel with
# the data (ClassLabel stores the string names alongside the ids).
features = Features({
    'input_ids': Sequence(Value('int64')),
    'attention_mask': Sequence(Value('int8')),
    'intent_label': ClassLabel(names=list(intent2id.keys())),
    'labels': Sequence(ClassLabel(names=list(ner2id.keys()))),
})

# Casting attaches the ClassLabel metadata to every split.
processed_datasets = processed_datasets.cast(features)
```

These first four steps are essential for model training/fine-tuning.

For model inference you will need to execute the same steps on new input text.

---

## Multitask Model

### 1. Multitask Model class
To use the model, you will need to define a `multitask_model.py` module containing the custom model class built on top of our base model.

```python
from transformers import AutoModel, PreTrainedModel
import torch.nn as nn


class MultitaskModel(PreTrainedModel):
    """
    A custom Transformer model with two heads: one for intent classification
    and one for named entity recognition (token classification).
    """
    def __init__(self, config, num_intent_labels: int, num_ner_labels: int):
        super().__init__(config)
        self.num_intent_labels = num_intent_labels
        self.num_ner_labels = num_ner_labels

        # Base transformer backbone (e.g. DistilBERT) built from the config;
        # pretrained weights are loaded separately via from_pretrained.
        self.transformer = AutoModel.from_config(config)

        # NOTE(review): `config.dim` and `config.seq_classif_dropout` are
        # DistilBERT-specific config attributes — confirm before swapping in
        # a different backbone.

        # --- Heads ---
        # 1. Intent Classification Head (MLP over the [CLS] representation)
        self.intent_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),  # GELU is a smooth activation function, common in Transformers
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_intent_labels)
        )

        # 2. NER (Token Classification) Head (MLP applied to every token)
        self.ner_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_ner_labels)
        )

        # Dropout layer for regularization of the backbone outputs.
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        intent_label=None,  # intent target ids; enables the intent loss
        labels=None,        # per-token NER target ids; enables the NER loss
    ):
        """Run both heads.

        Returns a dict with "loss" (None when either label set is absent,
        i.e. at inference — the Hugging Face convention), "intent_logits"
        of shape (batch, num_intent_labels) and "ner_logits" of shape
        (batch, seq_len, num_ner_labels).
        """
        # Last hidden states from the base transformer model.
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)

        # --- Intent Logits ---
        # Use the [CLS] token's output for intent classification.
        cls_token_output = self.dropout(sequence_output[:, 0, :])
        intent_logits = self.intent_classifier(cls_token_output)

        # --- NER Logits ---
        # Use all token outputs for NER.
        ner_logits = self.ner_classifier(self.dropout(sequence_output))

        # --- Calculate Combined Loss ---
        # BUGFIX: return None (not the int 0) when labels are absent, so the
        # Trainer treats the call as inference instead of trying to backprop
        # through / detach a plain Python integer.
        total_loss = None
        if intent_label is not None and labels is not None:
            loss_fct = nn.CrossEntropyLoss()  # ignore_index defaults to -100
            # Intent loss.
            intent_loss = loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1))
            # NER loss — positions the collator padded with -100 are ignored.
            ner_loss = loss_fct(ner_logits.view(-1, self.num_ner_labels), labels.view(-1))
            # Equal weighting of the two tasks (weight them here if needed).
            total_loss = intent_loss + ner_loss

        return {
            "loss": total_loss,
            "intent_logits": intent_logits,
            "ner_logits": ner_logits,
        }
```

---

### 2. Load Tokenizer

```
# Backbone checkpoint; the tokenizer must match the model instantiated below.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
```

---

### 3. Custom Metrics Function
This function is essential for a multitask model. It will be called by the Trainer at the end of each epoch to calculate both intent accuracy and NER F1-score.

```python
def compute_metrics(eval_pred):
    """Compute intent accuracy/F1 and entity-level NER F1 for the Trainer.

    `eval_pred` is a pair (predictions, label_values), each itself a pair:
    (intent logits, NER logits) and (intent ids, NER tag ids).
    """
    predictions, label_values = eval_pred
    intent_logits, ner_logits = predictions
    intent_true, ner_true = label_values

    # --- Intent Metrics (plain sequence classification) ---
    intent_hat = np.argmax(intent_logits, axis=1)
    intent_accuracy = accuracy_score(intent_true, intent_hat)
    intent_f1 = f1_score(intent_true, intent_hat, average='weighted')

    # --- NER Metrics (token classification, scored at the entity level) ---
    ner_hat = np.argmax(ner_logits, axis=2)

    # Tag-id -> tag-name lookup stored in the dataset's ClassLabel feature.
    id2ner = processed_datasets['train'].features['labels'].feature.names

    # Drop positions padded with -100 and map the surviving ids to tag strings.
    true_ner_labels = []
    true_ner_predictions = []
    for row_true, row_hat in zip(ner_true, ner_hat):
        kept = [(t, p) for t, p in zip(row_true, row_hat) if t != -100]
        true_ner_labels.append([id2ner[t] for t, _ in kept])
        true_ner_predictions.append([id2ner[p] for _, p in kept])

    ner_f1 = ner_f1_score(true_ner_labels, true_ner_predictions, mode='strict', scheme=IOB2)

    return {
        "intent_accuracy": intent_accuracy,
        "intent_f1": intent_f1,
        "ner_f1": ner_f1
    }
```

---

### 4. Instantiate the model
We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.

```
# Recover the label vocabularies from the processed dataset's ClassLabel
# features (lists of names, indexed by id).
id2intent = processed_datasets['train'].features['intent_label'].names
intent2id = {name: idx for idx, name in enumerate(id2intent)}
id2ner = processed_datasets['train'].features['labels'].feature.names
ner2id = {name: idx for idx, name in enumerate(id2ner)}

# Extend the base config with the label mappings so they are persisted
# alongside the model weights.
config = AutoConfig.from_pretrained(
    model_name,
    id2label_intent=id2intent,
    label2id_intent=intent2id,
    id2label_ner=id2ner,
    label2id_ner=ner2id
)

# Build the two-headed model around the configured backbone.
model = MultitaskModel(config, num_intent_labels=len(id2intent), num_ner_labels=len(id2ner))
```