andreaceto commited on
Commit
ba7da8c
·
verified ·
1 Parent(s): 7f5a53f

Create how_to_use.md

Browse files
Files changed (1) hide show
  1. how_to_use.md +208 -0
how_to_use.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Intended Use
2
+
3
+ ## Data Preparation
4
+
5
+ ### 1. Load the datasets
6
+ Load `train.jsonl`, `validation.jsonl`, and `test.jsonl` splits.
7
+
8
+ ```python
9
+ raw_datasets = load_dataset("andreaceto/hasd")
10
+ ```
11
+
12
+ ---
13
+
14
+ ### 2. Create Label Mappings
15
+ Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme.
16
+
17
+ ```python
18
# --- Create Intent Label Mappings ---
# Collect the unique intent labels present in the training split
intent_labels = raw_datasets['train'].unique('intent')
intent_labels.sort()  # deterministic ordering across runs
id2intent = dict(enumerate(intent_labels))
intent2id = {label: idx for idx, label in enumerate(intent_labels)}
print(f"Intent mapping (intent2id): {intent2id}\n")


# --- Create Entity (NER) Label Mappings in BIO format ---
# The entity types annotated in the dataset
entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]
# "O" marks tokens outside any entity; every entity type then contributes
# a B- (beginning) and an I- (inside) tag, in that order.
ner_tags = ["O"]
for entity_label in entity_labels:
    ner_tags.extend((f"B-{entity_label}", f"I-{entity_label}"))

id2ner = dict(enumerate(ner_tags))
ner2id = {tag: idx for idx, tag in enumerate(ner_tags)}
print(f"NER mapping (ner2id): {ner2id}")
39
+ ```
40
+
41
+ ---
42
+
43
+ ### 3. Preprocessing function
44
+ This is the core function. It takes a batch of examples (the dataset is mapped with `batched=True`) and does two things:
45
+ 1. Tokenizes the text.
46
+ 2. Aligns character-based entity spans (`start`, `end`) with the new wordpiece tokens, assigning the correct BIO tag ID to each token.
47
+
48
+ ```python
49
def preprocess_function(examples):
    """Tokenize a batch of examples and align intent / BIO entity labels.

    Args:
        examples: a batched dict with keys 'text' (list[str]), 'intent'
            (list[str]) and 'entities' (list[list[dict]]), where each entity
            dict carries character offsets 'start', 'end' and a 'label'.

    Returns:
        The tokenizer output augmented with 'intent_label' (int intent IDs)
        and 'labels' (per-token BIO tag IDs); 'offset_mapping' is removed.

    Relies on module-level `tokenizer`, `intent2id` and `ner2id`.
    """
    # --- Intent Processing ---
    # Convert intent strings to integer IDs
    intent_ids = [intent2id[intent] for intent in examples['intent']]

    # --- Tokenization ---
    # `return_offsets_mapping=True` gives per-token character spans, which we
    # need to align character-level entity annotations with wordpiece tokens.
    # Padding is handled later by the Trainer's data collator.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)

    # --- Entity (NER) Label Alignment ---
    ner_labels = []
    for i, entities in enumerate(examples['entities']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        # Start with all tokens labeled as 'O' (Outside)
        label_ids = [ner2id["O"]] * len(word_ids)

        # For each entity, tag its first overlapping token 'B-' and the rest 'I-'
        for entity in entities:
            start_char, end_char, label = entity['start'], entity['end'], entity['label']

            # BUG FIX: the original checked `label_ids[j] == ner2id["O"]` to
            # decide between B- and I-. Since every token starts as 'O', ALL
            # tokens of an entity received a B- tag and I- was never assigned
            # within a single entity. Track the first token explicitly instead.
            first_token_of_entity = True

            for j, word_id in enumerate(word_ids):
                if word_id is None:
                    # Special tokens ([CLS], [SEP], ...) have no word id
                    continue

                # Get the character span for the current token
                token_char_span = tokenized_inputs['offset_mapping'][i][j]
                if token_char_span is None:
                    continue

                token_start, token_end = token_char_span

                # Check if the token overlaps the entity's character span
                if start_char < token_end and end_char > token_start:
                    if first_token_of_entity:
                        # Assign the 'B-' tag to the first token only
                        label_ids[j] = ner2id[f"B-{label}"]
                        first_token_of_entity = False
                    else:
                        # Subsequent tokens of the same entity get 'I-'
                        label_ids[j] = ner2id[f"I-{label}"]

        ner_labels.append(label_ids)

    # Add the final processed labels to our tokenized inputs
    tokenized_inputs["intent_label"] = intent_ids
    tokenized_inputs["labels"] = ner_labels

    # offset_mapping is only needed for alignment; drop it before training
    tokenized_inputs.pop("offset_mapping", None)

    return tokenized_inputs
100
+ ```
101
+
102
+ ---
103
+
104
+ ### 4. Apply Preprocessing and Save
105
+ Now we apply this function to our entire dataset and save the final, processed version.
106
+
107
+ ```python
108
+ # Apply the function to all splits of the dataset
109
# Run the preprocessing function over every split; the original columns are
# dropped so only the model inputs and labels remain.
processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

# Describe the processed schema explicitly, attaching human-readable label
# names via ClassLabel so they travel with the dataset.
features = Features({
    'input_ids': Sequence(Value('int64')),
    'attention_mask': Sequence(Value('int8')),
    'intent_label': ClassLabel(names=list(intent2id.keys())),
    'labels': Sequence(ClassLabel(names=list(ner2id.keys()))),
})

# Cast so the integer labels are interpreted through the named ClassLabels
processed_datasets = processed_datasets.cast(features)
121
+ ```
122
+
123
+ These first four steps are essential for model training/fine-tuning.
124
+
125
+ For model inference you will need to execute the same steps on new input text.
126
+
127
+ ---
128
+
129
+ ## Multitask Model definition
130
+ To use the model you will need to define a `multitask_model.py` with the custom model class built upon our base model.
131
+
132
+ ```python
133
+ from transformers import AutoModel, PreTrainedModel
134
+ import torch.nn as nn
135
+
136
+
137
class MultitaskModel(PreTrainedModel):
    """
    A custom Transformer model with two heads: one for intent classification
    and one for named entity recognition (token classification).

    Args:
        config: the base model config (attribute names `dim` and
            `seq_classif_dropout` are DistilBERT-style — confirm if you swap
            in a different backbone).
        num_intent_labels: number of intent classes.
        num_ner_labels: number of BIO tags.
    """
    def __init__(self, config, num_intent_labels: int, num_ner_labels: int):
        super().__init__(config)
        self.num_intent_labels = num_intent_labels
        self.num_ner_labels = num_ner_labels

        # Base transformer (e.g. DistilBERT). `from_config` builds an
        # uninitialized skeleton; trained weights arrive via from_pretrained
        # or a loaded state_dict.
        self.transformer = AutoModel.from_config(config)

        # --- Heads ---
        # 1. Intent Classification Head (MLP over the [CLS] embedding)
        self.intent_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),  # GELU is a smooth activation function, common in Transformers
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_intent_labels)
        )

        # 2. NER (Token Classification) Head (MLP applied to every token)
        self.ner_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_ner_labels)
        )

        # Dropout layer for regularization, shared by both heads
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        intent_label=None,  # For calculating intent loss
        labels=None,        # For calculating NER loss
    ):
        """Run both heads; returns a dict with 'loss' (None at inference),
        'intent_logits' (batch, num_intents) and 'ner_logits'
        (batch, seq_len, num_ner_labels)."""
        # Last hidden states from the base transformer
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)

        # --- Intent Logits ---
        # Use the [CLS] (first) token's output for intent classification
        cls_token_output = self.dropout(sequence_output[:, 0, :])
        intent_logits = self.intent_classifier(cls_token_output)

        # --- NER Logits ---
        # Use all token outputs for NER
        ner_logits = self.ner_classifier(self.dropout(sequence_output))

        # --- Calculate Combined Loss ---
        # FIX: the original initialized total_loss to the int 0 and returned
        # it even when no labels were supplied; HF convention is loss=None at
        # inference, and an int 0 would break loss.backward() in a trainer.
        total_loss = None
        if intent_label is not None and labels is not None:
            # CrossEntropyLoss ignores targets equal to -100 by default,
            # which is how padded/special NER positions are skipped.
            loss_fct = nn.CrossEntropyLoss()
            # Intent loss
            intent_loss = loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1))
            # NER loss (ignore padding tokens with label -100)
            ner_loss = loss_fct(ner_logits.view(-1, self.num_ner_labels), labels.view(-1))
            # Combine the losses (weight them if one task matters more)
            total_loss = intent_loss + ner_loss

        return {
            "loss": total_loss,
            "intent_logits": intent_logits,
            "ner_logits": ner_logits,
        }
208
+ ```