Update how_to_use.md
Browse files- how_to_use.md +93 -48
how_to_use.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
## Data Preparation
|
| 4 |
|
|
@@ -48,43 +48,40 @@ This is the core function. It takes a single data example and does two things:
|
|
| 48 |
```python
|
| 49 |
def preprocess_function(examples):
|
| 50 |
# --- Intent Processing ---
|
| 51 |
-
# Convert intent strings to integer IDs
|
| 52 |
intent_ids = [intent2id[intent] for intent in examples['intent']]
|
| 53 |
|
| 54 |
# --- Tokenization ---
|
| 55 |
-
# Tokenize the text. `truncation=True` and `padding` are handled by the Trainer later.
|
| 56 |
tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)
|
| 57 |
|
| 58 |
# --- Entity (NER) Label Alignment ---
|
| 59 |
ner_labels = []
|
| 60 |
for i, entities in enumerate(examples['entities']):
|
| 61 |
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
| 62 |
-
|
| 63 |
-
# Start with all tokens labeled as 'O' (Outside)
|
| 64 |
label_ids = [ner2id["O"]] * len(word_ids)
|
| 65 |
-
|
| 66 |
# For each entity, find the corresponding tokens and assign B- and I- tags
|
| 67 |
for entity in entities:
|
| 68 |
start_char, end_char, label = entity['start'], entity['end'], entity['label']
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
| 70 |
for j, word_id in enumerate(word_ids):
|
| 71 |
if word_id is None:
|
| 72 |
continue
|
| 73 |
-
|
| 74 |
-
# Get the character span for the current token
|
| 75 |
token_char_span = tokenized_inputs['offset_mapping'][i][j]
|
| 76 |
-
if token_char_span is None:
|
| 77 |
-
continue
|
| 78 |
-
|
| 79 |
token_start, token_end = token_char_span
|
| 80 |
-
|
| 81 |
# Check if the token is part of the entity
|
| 82 |
if start_char < token_end and end_char > token_start:
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
label_ids[j] = ner2id[f"B-{label}"]
|
|
|
|
| 86 |
else:
|
| 87 |
-
#
|
| 88 |
label_ids[j] = ner2id[f"I-{label}"]
|
| 89 |
|
| 90 |
ner_labels.append(label_ids)
|
|
@@ -92,10 +89,10 @@ def preprocess_function(examples):
|
|
| 92 |
# Add the final processed labels to our tokenized inputs
|
| 93 |
tokenized_inputs["intent_label"] = intent_ids
|
| 94 |
tokenized_inputs["labels"] = ner_labels
|
| 95 |
-
|
| 96 |
-
|
| 97 |
tokenized_inputs.pop("offset_mapping", None)
|
| 98 |
-
|
| 99 |
return tokenized_inputs
|
| 100 |
```
|
| 101 |
|
|
@@ -120,10 +117,6 @@ features = Features({
|
|
| 120 |
processed_datasets = processed_datasets.cast(features)
|
| 121 |
```
|
| 122 |
|
| 123 |
-
These first four steps are essential for model training/fine-tuning.
|
| 124 |
-
|
| 125 |
-
For model inference you will need to execute the same steps on new input text.
|
| 126 |
-
|
| 127 |
---
|
| 128 |
|
| 129 |
## Multitask Model
|
|
@@ -135,16 +128,15 @@ To use the model you will need to define a `multitask_model.py` with the custom
|
|
| 135 |
from transformers import AutoModel, PreTrainedModel
|
| 136 |
import torch.nn as nn
|
| 137 |
|
| 138 |
-
|
| 139 |
class MultitaskModel(PreTrainedModel):
|
| 140 |
"""
|
| 141 |
A custom Transformer model with two heads: one for intent classification
|
| 142 |
and one for named entity recognition (token classification).
|
| 143 |
"""
|
| 144 |
-
|
|
|
|
|
|
|
| 145 |
super().__init__(config)
|
| 146 |
-
self.num_intent_labels = num_intent_labels
|
| 147 |
-
self.num_ner_labels = num_ner_labels
|
| 148 |
|
| 149 |
# Load the base transformer model (e.g., DistilBERT)
|
| 150 |
self.transformer = AutoModel.from_config(config)
|
|
@@ -155,7 +147,7 @@ class MultitaskModel(PreTrainedModel):
|
|
| 155 |
nn.Linear(config.dim, config.dim // 2),
|
| 156 |
nn.GELU(), # GELU is a smooth activation function, common in Transformers
|
| 157 |
nn.Dropout(0.3),
|
| 158 |
-
nn.Linear(config.dim // 2, self.num_intent_labels)
|
| 159 |
)
|
| 160 |
|
| 161 |
# 2. NER (Token Classification) Head (MLP)
|
|
@@ -163,7 +155,7 @@ class MultitaskModel(PreTrainedModel):
|
|
| 163 |
nn.Linear(config.dim, config.dim // 2),
|
| 164 |
nn.GELU(),
|
| 165 |
nn.Dropout(0.3),
|
| 166 |
-
nn.Linear(config.dim // 2, self.num_ner_labels)
|
| 167 |
)
|
| 168 |
|
| 169 |
# Dropout layer for regularization
|
|
@@ -196,9 +188,9 @@ class MultitaskModel(PreTrainedModel):
|
|
| 196 |
if intent_label is not None and labels is not None:
|
| 197 |
loss_fct = nn.CrossEntropyLoss()
|
| 198 |
# Intent loss
|
| 199 |
-
intent_loss = loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1))
|
| 200 |
# NER loss (ignore padding tokens with label -100)
|
| 201 |
-
ner_loss = loss_fct(ner_logits.view(-1, self.num_ner_labels), labels.view(-1))
|
| 202 |
# Combine the losses (you can also weight them if one task is more important)
|
| 203 |
total_loss = intent_loss + ner_loss
|
| 204 |
|
|
@@ -268,21 +260,74 @@ def compute_metrics(eval_pred):
|
|
| 268 |
We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.
|
| 269 |
|
| 270 |
```
|
| 271 |
-
# Get
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
#
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
#
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training/Fine-tuning
|
| 2 |
|
| 3 |
## Data Preparation
|
| 4 |
|
|
|
|
| 48 |
```python
|
| 49 |
def preprocess_function(examples):
|
| 50 |
# --- Intent Processing ---
|
|
|
|
| 51 |
intent_ids = [intent2id[intent] for intent in examples['intent']]
|
| 52 |
|
| 53 |
# --- Tokenization ---
|
|
|
|
| 54 |
tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)
|
| 55 |
|
| 56 |
# --- Entity (NER) Label Alignment ---
|
| 57 |
ner_labels = []
|
| 58 |
for i, entities in enumerate(examples['entities']):
|
| 59 |
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
|
|
|
|
|
|
| 60 |
label_ids = [ner2id["O"]] * len(word_ids)
|
| 61 |
+
|
| 62 |
# For each entity, find the corresponding tokens and assign B- and I- tags
|
| 63 |
for entity in entities:
|
| 64 |
start_char, end_char, label = entity['start'], entity['end'], entity['label']
|
| 65 |
+
|
| 66 |
+
# This flag tracks if we've found the first token of the current entity
|
| 67 |
+
first_token_of_entity_found = False
|
| 68 |
+
|
| 69 |
for j, word_id in enumerate(word_ids):
|
| 70 |
if word_id is None:
|
| 71 |
continue
|
| 72 |
+
|
|
|
|
| 73 |
token_char_span = tokenized_inputs['offset_mapping'][i][j]
|
|
|
|
|
|
|
|
|
|
| 74 |
token_start, token_end = token_char_span
|
| 75 |
+
|
| 76 |
# Check if the token is part of the entity
|
| 77 |
if start_char < token_end and end_char > token_start:
|
| 78 |
+
# This is the key change. We use the flag to decide the tag.
|
| 79 |
+
if not first_token_of_entity_found:
|
| 80 |
+
# This is the first token of the entity, assign the 'B-' tag
|
| 81 |
label_ids[j] = ner2id[f"B-{label}"]
|
| 82 |
+
first_token_of_entity_found = True
|
| 83 |
else:
|
| 84 |
+
# This is a subsequent token of the same entity, assign 'I-'
|
| 85 |
label_ids[j] = ner2id[f"I-{label}"]
|
| 86 |
|
| 87 |
ner_labels.append(label_ids)
|
|
|
|
| 89 |
# Add the final processed labels to our tokenized inputs
|
| 90 |
tokenized_inputs["intent_label"] = intent_ids
|
| 91 |
tokenized_inputs["labels"] = ner_labels
|
| 92 |
+
|
| 93 |
+
# Remove offset_mapping as it's not needed by the model
|
| 94 |
tokenized_inputs.pop("offset_mapping", None)
|
| 95 |
+
|
| 96 |
return tokenized_inputs
|
| 97 |
```
|
| 98 |
|
|
|
|
| 117 |
processed_datasets = processed_datasets.cast(features)
|
| 118 |
```
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
---
|
| 121 |
|
| 122 |
## Multitask Model
|
|
|
|
| 128 |
from transformers import AutoModel, PreTrainedModel
|
| 129 |
import torch.nn as nn
|
| 130 |
|
|
|
|
| 131 |
class MultitaskModel(PreTrainedModel):
|
| 132 |
"""
|
| 133 |
A custom Transformer model with two heads: one for intent classification
|
| 134 |
and one for named entity recognition (token classification).
|
| 135 |
"""
|
| 136 |
+
config_class = AutoConfig
|
| 137 |
+
|
| 138 |
+
def __init__(self, config):
|
| 139 |
super().__init__(config)
|
|
|
|
|
|
|
| 140 |
|
| 141 |
# Load the base transformer model (e.g., DistilBERT)
|
| 142 |
self.transformer = AutoModel.from_config(config)
|
|
|
|
| 147 |
nn.Linear(config.dim, config.dim // 2),
|
| 148 |
nn.GELU(), # GELU is a smooth activation function, common in Transformers
|
| 149 |
nn.Dropout(0.3),
|
| 150 |
+
nn.Linear(config.dim // 2, self.config.num_intent_labels)
|
| 151 |
)
|
| 152 |
|
| 153 |
# 2. NER (Token Classification) Head (MLP)
|
|
|
|
| 155 |
nn.Linear(config.dim, config.dim // 2),
|
| 156 |
nn.GELU(),
|
| 157 |
nn.Dropout(0.3),
|
| 158 |
+
nn.Linear(config.dim // 2, self.config.num_ner_labels)
|
| 159 |
)
|
| 160 |
|
| 161 |
# Dropout layer for regularization
|
|
|
|
| 188 |
if intent_label is not None and labels is not None:
|
| 189 |
loss_fct = nn.CrossEntropyLoss()
|
| 190 |
# Intent loss
|
| 191 |
+
intent_loss = loss_fct(intent_logits.view(-1, self.config.num_intent_labels), intent_label.view(-1))
|
| 192 |
# NER loss (ignore padding tokens with label -100)
|
| 193 |
+
ner_loss = loss_fct(ner_logits.view(-1, self.config.num_ner_labels), labels.view(-1))
|
| 194 |
# Combine the losses (you can also weight them if one task is more important)
|
| 195 |
total_loss = intent_loss + ner_loss
|
| 196 |
|
|
|
|
| 260 |
We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.
|
| 261 |
|
| 262 |
```
|
| 263 |
+
# --- Get DistilBert config file ---
|
| 264 |
+
config = AutoConfig.from_pretrained(model_name)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# --- Create Label Mappings Directly from Data ---
|
| 268 |
+
# 1. Intent Labels
|
| 269 |
+
# Get a sorted list of unique intent strings from the training set
|
| 270 |
+
intent_label_list = processed_datasets['train'].features['intent_label'].names
|
| 271 |
+
# Create the mappings
|
| 272 |
+
id2intent = {i: label for i, label in enumerate(intent_label_list)}
|
| 273 |
+
intent2id = {label: i for i, label in enumerate(intent_label_list)}
|
| 274 |
+
|
| 275 |
+
# 2. NER Labels
|
| 276 |
+
# Get a sorted list of unique entities strings from the training set
|
| 277 |
+
ner_label_list = processed_datasets['train'].features['labels'].feature.names
|
| 278 |
+
# Create the mappings
|
| 279 |
+
id2ner = {i: label for i, label in enumerate(ner_label_list)}
|
| 280 |
+
ner2id = {label: i for i, label in enumerate(ner_label_list)}
|
| 281 |
+
|
| 282 |
+
# --- Add custom parameters to config object ---
|
| 283 |
+
config.num_intent_labels = len(id2intent)
|
| 284 |
+
config.num_ner_labels = len(id2ner)
|
| 285 |
+
config.id2label_intent = id2intent
|
| 286 |
+
config.label2id_intent = intent2id
|
| 287 |
+
config.id2label_ner = id2ner
|
| 288 |
+
config.label2id_ner = ner2id
|
| 289 |
+
|
| 290 |
+
# --- Finally instantiate the model ---
|
| 291 |
+
model = MultitaskModel(config)
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
---
|
| 295 |
+
|
| 296 |
+
# Inference
|
| 297 |
+
|
| 298 |
+
## Load the Model
|
| 299 |
+
Load the trained tokenizer and model:
|
| 300 |
+
```
|
| 301 |
+
tokenizer = AutoTokenizer.from_pretrained("andreaceto/schedulebot-nlu-engine")
|
| 302 |
+
multitask_model = MultitaskModel.from_pretrained("andreaceto/schedulebot-nlu-engine")
|
| 303 |
```
|
| 304 |
+
|
| 305 |
+
---
|
| 306 |
+
|
| 307 |
+
## Preprocess raw text
|
| 308 |
+
In order to be able to run inference on the model, it's necessary to use the tokenizer on the raw input text:
|
| 309 |
+
|
| 310 |
+
```
|
| 311 |
+
inputs = self.tokenizer(text, return_tensors="pt")
|
| 312 |
+
```
|
| 313 |
+
|
| 314 |
+
---
|
| 315 |
+
|
| 316 |
+
## Get predictions
|
| 317 |
+
Now you can run inference using:
|
| 318 |
+
|
| 319 |
+
```
|
| 320 |
+
with torch.no_grad():
|
| 321 |
+
outputs = self.multitask_model(**inputs)
|
| 322 |
+
```
|
| 323 |
+
|
| 324 |
+
Since the model returns a dictionary, you can access logits by using:
|
| 325 |
+
|
| 326 |
+
```
|
| 327 |
+
intent_logits = outputs["intent_logits"]
|
| 328 |
+
ner_logits = outputs["ner_logits"]
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
You can now extract predictions using `torch.argmax()` and convert the results into categorical labels by using the dictionaries in the config file (`id2label_intent` and `id2label_ner`).
|
| 332 |
+
|
| 333 |
+
**N.B.**: The JSON format specification requires that all keys in an object be strings. When the transformers library saves your configuration, it therefore converts your integer keys (0, 1, 2, etc.) into strings ("0", "1", "2"), which causes a mismatch when you try to use a prediction extracted with `torch.argmax()` as a key in the config dictionaries. The solution is to cast the `int` value of the prediction to a string with `str()`.
|