andreaceto commited on
Commit
29fe96a
·
verified ·
1 Parent(s): c236fd6

Update how_to_use.md

Browse files
Files changed (1) hide show
  1. how_to_use.md +93 -48
how_to_use.md CHANGED
@@ -1,4 +1,4 @@
1
- # Intended Use
2
 
3
  ## Data Preparation
4
 
@@ -48,43 +48,40 @@ This is the core function. It takes a single data example and does two things:
48
  ```python
49
  def preprocess_function(examples):
50
  # --- Intent Processing ---
51
- # Convert intent strings to integer IDs
52
  intent_ids = [intent2id[intent] for intent in examples['intent']]
53
 
54
  # --- Tokenization ---
55
- # Tokenize the text. `truncation=True` and `padding` are handled by the Trainer later.
56
  tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)
57
 
58
  # --- Entity (NER) Label Alignment ---
59
  ner_labels = []
60
  for i, entities in enumerate(examples['entities']):
61
  word_ids = tokenized_inputs.word_ids(batch_index=i)
62
-
63
- # Start with all tokens labeled as 'O' (Outside)
64
  label_ids = [ner2id["O"]] * len(word_ids)
65
-
66
  # For each entity, find the corresponding tokens and assign B- and I- tags
67
  for entity in entities:
68
  start_char, end_char, label = entity['start'], entity['end'], entity['label']
69
-
 
 
 
70
  for j, word_id in enumerate(word_ids):
71
  if word_id is None:
72
  continue
73
-
74
- # Get the character span for the current token
75
  token_char_span = tokenized_inputs['offset_mapping'][i][j]
76
- if token_char_span is None:
77
- continue
78
-
79
  token_start, token_end = token_char_span
80
-
81
  # Check if the token is part of the entity
82
  if start_char < token_end and end_char > token_start:
83
- if label_ids[j] == ner2id["O"]:
84
- # Assign the 'B-' tag to the first token
 
85
  label_ids[j] = ner2id[f"B-{label}"]
 
86
  else:
87
- # Assign the 'I-' tag to subsequent tokens within the same entity
88
  label_ids[j] = ner2id[f"I-{label}"]
89
 
90
  ner_labels.append(label_ids)
@@ -92,10 +89,10 @@ def preprocess_function(examples):
92
  # Add the final processed labels to our tokenized inputs
93
  tokenized_inputs["intent_label"] = intent_ids
94
  tokenized_inputs["labels"] = ner_labels
95
-
96
- # Remove offset_mapping
97
  tokenized_inputs.pop("offset_mapping", None)
98
-
99
  return tokenized_inputs
100
  ```
101
 
@@ -120,10 +117,6 @@ features = Features({
120
  processed_datasets = processed_datasets.cast(features)
121
  ```
122
 
123
- This first four steps are essential for model training\fine-tuning.
124
-
125
- For model inference you will need to execute the same steps on new input text.
126
-
127
  ---
128
 
129
  ## Multitask Model
@@ -135,16 +128,15 @@ To use the model you will need to define a `multitask_model.py` with the custom
135
  from transformers import AutoModel, PreTrainedModel
136
  import torch.nn as nn
137
 
138
-
139
  class MultitaskModel(PreTrainedModel):
140
  """
141
  A custom Transformer model with two heads: one for intent classification
142
  and one for named entity recognition (token classification).
143
  """
144
- def __init__(self, config, num_intent_labels: int, num_ner_labels: int):
 
 
145
  super().__init__(config)
146
- self.num_intent_labels = num_intent_labels
147
- self.num_ner_labels = num_ner_labels
148
 
149
  # Load the base transformer model (e.g., DistilBERT)
150
  self.transformer = AutoModel.from_config(config)
@@ -155,7 +147,7 @@ class MultitaskModel(PreTrainedModel):
155
  nn.Linear(config.dim, config.dim // 2),
156
  nn.GELU(), # GELU is a smooth activation function, common in Transformers
157
  nn.Dropout(0.3),
158
- nn.Linear(config.dim // 2, self.num_intent_labels)
159
  )
160
 
161
  # 2. NER (Token Classification) Head (MLP)
@@ -163,7 +155,7 @@ class MultitaskModel(PreTrainedModel):
163
  nn.Linear(config.dim, config.dim // 2),
164
  nn.GELU(),
165
  nn.Dropout(0.3),
166
- nn.Linear(config.dim // 2, self.num_ner_labels)
167
  )
168
 
169
  # Dropout layer for regularization
@@ -196,9 +188,9 @@ class MultitaskModel(PreTrainedModel):
196
  if intent_label is not None and labels is not None:
197
  loss_fct = nn.CrossEntropyLoss()
198
  # Intent loss
199
- intent_loss = loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1))
200
  # NER loss (ignore padding tokens with label -100)
201
- ner_loss = loss_fct(ner_logits.view(-1, self.num_ner_labels), labels.view(-1))
202
  # Combine the losses (you can also weight them if one task is more important)
203
  total_loss = intent_loss + ner_loss
204
 
@@ -268,21 +260,74 @@ def compute_metrics(eval_pred):
268
  We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.
269
 
270
  ```
271
- # Get label mappings from the dataset features
272
- id2intent = processed_datasets['train'].features['intent_label'].names
273
- intent2id = {name: i for i, name in enumerate(id2intent)}
274
- id2ner = processed_datasets['train'].features['labels'].feature.names
275
- ner2id = {name: i for i, name in enumerate(id2ner)}
276
-
277
- # Load the model config and add our custom parameters
278
- config = AutoConfig.from_pretrained(
279
- model_name,
280
- id2label_intent=id2intent,
281
- label2id_intent=intent2id,
282
- id2label_ner=id2ner,
283
- label2id_ner=ner2id
284
- )
285
-
286
- # Instantiate our custom model with the new config
287
- model = MultitaskModel(config, num_intent_labels=len(id2intent), num_ner_labels=len(id2ner))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training/Fine-tuning
2
 
3
  ## Data Preparation
4
 
 
48
  ```python
49
  def preprocess_function(examples):
50
  # --- Intent Processing ---
 
51
  intent_ids = [intent2id[intent] for intent in examples['intent']]
52
 
53
  # --- Tokenization ---
 
54
  tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)
55
 
56
  # --- Entity (NER) Label Alignment ---
57
  ner_labels = []
58
  for i, entities in enumerate(examples['entities']):
59
  word_ids = tokenized_inputs.word_ids(batch_index=i)
 
 
60
  label_ids = [ner2id["O"]] * len(word_ids)
61
+
62
  # For each entity, find the corresponding tokens and assign B- and I- tags
63
  for entity in entities:
64
  start_char, end_char, label = entity['start'], entity['end'], entity['label']
65
+
66
+ # This flag tracks if we've found the first token of the current entity
67
+ first_token_of_entity_found = False
68
+
69
  for j, word_id in enumerate(word_ids):
70
  if word_id is None:
71
  continue
72
+
 
73
  token_char_span = tokenized_inputs['offset_mapping'][i][j]
 
 
 
74
  token_start, token_end = token_char_span
75
+
76
  # Check if the token is part of the entity
77
  if start_char < token_end and end_char > token_start:
78
+ # This is the key change. We use the flag to decide the tag.
79
+ if not first_token_of_entity_found:
80
+ # This is the first token of the entity, assign the 'B-' tag
81
  label_ids[j] = ner2id[f"B-{label}"]
82
+ first_token_of_entity_found = True
83
  else:
84
+ # This is a subsequent token of the same entity, assign 'I-'
85
  label_ids[j] = ner2id[f"I-{label}"]
86
 
87
  ner_labels.append(label_ids)
 
89
  # Add the final processed labels to our tokenized inputs
90
  tokenized_inputs["intent_label"] = intent_ids
91
  tokenized_inputs["labels"] = ner_labels
92
+
93
+ # Remove offset_mapping as it's not needed by the model
94
  tokenized_inputs.pop("offset_mapping", None)
95
+
96
  return tokenized_inputs
97
  ```
98
 
 
117
  processed_datasets = processed_datasets.cast(features)
118
  ```
119
 
 
 
 
 
120
  ---
121
 
122
  ## Multitask Model
 
128
  from transformers import AutoModel, PreTrainedModel
129
  import torch.nn as nn
130
 
 
131
  class MultitaskModel(PreTrainedModel):
132
  """
133
  A custom Transformer model with two heads: one for intent classification
134
  and one for named entity recognition (token classification).
135
  """
136
+ config_class = AutoConfig
137
+
138
+ def __init__(self, config):
139
  super().__init__(config)
 
 
140
 
141
  # Load the base transformer model (e.g., DistilBERT)
142
  self.transformer = AutoModel.from_config(config)
 
147
  nn.Linear(config.dim, config.dim // 2),
148
  nn.GELU(), # GELU is a smooth activation function, common in Transformers
149
  nn.Dropout(0.3),
150
+ nn.Linear(config.dim // 2, self.config.num_intent_labels)
151
  )
152
 
153
  # 2. NER (Token Classification) Head (MLP)
 
155
  nn.Linear(config.dim, config.dim // 2),
156
  nn.GELU(),
157
  nn.Dropout(0.3),
158
+ nn.Linear(config.dim // 2, self.config.num_ner_labels)
159
  )
160
 
161
  # Dropout layer for regularization
 
188
  if intent_label is not None and labels is not None:
189
  loss_fct = nn.CrossEntropyLoss()
190
  # Intent loss
191
+ intent_loss = loss_fct(intent_logits.view(-1, self.config.num_intent_labels), intent_label.view(-1))
192
  # NER loss (ignore padding tokens with label -100)
193
+ ner_loss = loss_fct(ner_logits.view(-1, self.config.num_ner_labels), labels.view(-1))
194
  # Combine the losses (you can also weight them if one task is more important)
195
  total_loss = intent_loss + ner_loss
196
 
 
260
  We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.
261
 
262
  ```
263
+ # --- Get DistilBert config file ---
264
+ config = AutoConfig.from_pretrained(model_name)
265
+
266
+
267
+ # --- Create Label Mappings Directly from Data ---
268
+ # 1. Intent Labels
269
+ # Get a sorted list of unique intent strings from the training set
270
+ intent_label_list = processed_datasets['train'].features['intent_label'].names
271
+ # Create the mappings
272
+ id2intent = {i: label for i, label in enumerate(intent_label_list)}
273
+ intent2id = {label: i for i, label in enumerate(intent_label_list)}
274
+
275
+ # 2. NER Labels
276
+ # Get a sorted list of unique entities strings from the training set
277
+ ner_label_list = processed_datasets['train'].features['labels'].feature.names
278
+ # Create the mappings
279
+ id2ner = {i: label for i, label in enumerate(ner_label_list)}
280
+ ner2id = {label: i for i, label in enumerate(ner_label_list)}
281
+
282
+ # --- Add custom parameters to config object ---
283
+ config.num_intent_labels = len(id2intent)
284
+ config.num_ner_labels = len(id2ner)
285
+ config.id2label_intent = id2intent
286
+ config.label2id_intent = intent2id
287
+ config.id2label_ner = id2ner
288
+ config.label2id_ner = ner2id
289
+
290
+ # --- Finally instantiate the model ---
291
+ model = MultitaskModel(config)
292
+ ```
293
+
294
+ ---
295
+
296
+ # Inference
297
+
298
+ ## Load the Model
299
+ Load the trained tokenizer and model:
300
+ ```
301
+ tokenizer = AutoTokenizer.from_pretrained("andreaceto/schedulebot-nlu-engine")
302
+ multitask_model = MultitaskModel.from_pretrained("andreaceto/schedulebot-nlu-engine")
303
  ```
304
+
305
+ ---
306
+
307
+ ## Preprocess raw text
308
+ In order to be able to run inference with the model, you first need to apply the tokenizer to the raw input text:
309
+
310
+ ```
311
+ inputs = tokenizer(text, return_tensors="pt")
312
+ ```
313
+
314
+ ---
315
+
316
+ ## Get predictions
317
+ Now you can run inference using:
318
+
319
+ ```
320
+ with torch.no_grad():
321
+     outputs = multitask_model(**inputs)
322
+ ```
323
+
324
+ Since the model returns a dictionary, you can access logits by using:
325
+
326
+ ```
327
+ intent_logits = outputs["intent_logits"]
328
+ ner_logits = outputs["ner_logits"]
329
+ ```
330
+
331
+ You can now extract the predictions using `torch.argmax()` and convert the results into categorical labels by using the dictionaries in the config file (`id2label_intent` and `id2label_ner`).
332
+
333
+ **N.B.**: The JSON format specification requires that all keys in an object be strings. When the transformers library saves your configuration, it therefore converts your integer keys (0, 1, 2, etc.) into strings ("0", "1", "2"), which causes a mismatch when you try to use the integer prediction from `torch.argmax()` as a key in the config dictionaries. The solution is to cast the `int` prediction to a string with `str()` before the lookup.