| # Intended Use | |
| ## Data Preparation | |
| ### 1. Load the datasets | |
| Load `train.jsonl`, `validation.jsonl`, and `test.jsonl` splits. | |
| ```python | |
| raw_datasets = load_dataset("andreaceto/hasd") | |
| ``` | |
| --- | |
| ### 2. Create Label Mappings | |
| Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme. | |
| ```python | |
# --- Intent label mappings ---
# Deterministic id assignment: sort the distinct intent strings before numbering.
intent_labels = sorted(raw_datasets['train'].unique('intent'))
id2intent = dict(enumerate(intent_labels))
intent2id = {name: idx for idx, name in id2intent.items()}
print(f"Intent mapping (intent2id): {intent2id}\n")

# --- Entity (NER) label mappings, BIO scheme ---
entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]

# "O" marks tokens outside any entity; each entity type contributes a
# B- (beginning) tag followed by an I- (inside) tag.
ner_tags = ["O"] + [f"{prefix}-{name}" for name in entity_labels for prefix in ("B", "I")]

id2ner = dict(enumerate(ner_tags))
ner2id = {tag: idx for idx, tag in id2ner.items()}
print(f"NER mapping (ner2id): {ner2id}")
| ``` | |
| --- | |
| ### 3. Preprocessing function | |
This is the core function. It processes a batch of examples (it is mapped with `batched=True`) and does two things:
1. Tokenizes the text.
2. Aligns character-based entity spans (`start`, `end`) with the resulting wordpiece tokens, assigning the correct BIO tag ID to each token.
| ```python | |
def preprocess_function(examples):
    """Tokenize a batch of examples and attach aligned intent / BIO-NER label ids.

    For each example this:
      1. maps the intent string to its integer id via ``intent2id``,
      2. tokenizes the text with offset mapping (padding is deferred to the
         Trainer / data collator), and
      3. projects the character-level entity spans onto wordpiece tokens,
         tagging the first overlapping token ``B-<label>`` and every
         subsequent token of the same span ``I-<label>``; all other
         tokens stay ``"O"``.
    """
    # --- Intent Processing ---
    # Convert intent strings to integer IDs
    intent_ids = [intent2id[intent] for intent in examples['intent']]
    # --- Tokenization ---
    # Tokenize the text. `truncation=True` and `padding` are handled by the Trainer later.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)
    # --- Entity (NER) Label Alignment ---
    ner_labels = []
    for i, entities in enumerate(examples['entities']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        offsets = tokenized_inputs['offset_mapping'][i]
        # Start with all tokens labeled as 'O' (Outside).
        # NOTE(review): special tokens ([CLS]/[SEP]) also keep 'O' here; if they
        # should be excluded from the loss they would need -100 instead, but that
        # would conflict with the ClassLabel cast performed later — TODO confirm.
        label_ids = [ner2id["O"]] * len(word_ids)
        # For each entity, find the corresponding tokens and assign B- and I- tags
        for entity in entities:
            start_char, end_char, label = entity['start'], entity['end'], entity['label']
            # BUG FIX: only the FIRST overlapping token gets 'B-'; every later
            # token of the same entity gets 'I-'. (The previous version tested
            # `label_ids[j] == ner2id["O"]`, which is true for every token in
            # the span, so all of them were tagged 'B-' and 'I-' never appeared.)
            seen_first_token = False
            for j, word_id in enumerate(word_ids):
                if word_id is None:
                    continue
                # Get the character span for the current token
                token_char_span = offsets[j]
                if token_char_span is None:
                    continue
                token_start, token_end = token_char_span
                # Check if the token overlaps the entity's character span
                if start_char < token_end and end_char > token_start:
                    if not seen_first_token:
                        label_ids[j] = ner2id[f"B-{label}"]
                        seen_first_token = True
                    else:
                        label_ids[j] = ner2id[f"I-{label}"]
        ner_labels.append(label_ids)
    # Add the final processed labels to our tokenized inputs
    tokenized_inputs["intent_label"] = intent_ids
    tokenized_inputs["labels"] = ner_labels
    # Remove offset_mapping: it is not a model input
    tokenized_inputs.pop("offset_mapping", None)
    return tokenized_inputs
| ``` | |
| --- | |
| ### 4. Apply Preprocessing and Save | |
| Now we apply this function to our entire dataset and save the final, processed version. | |
| ```python | |
# Run the preprocessing over every split; dropping the raw columns leaves
# only the model inputs and labels.
processed_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets['train'].column_names)

# Re-declare the schema so both label columns carry their human-readable
# names as ClassLabel features.
features = Features({
    'input_ids': Sequence(Value('int64')),
    'attention_mask': Sequence(Value('int8')),
    'intent_label': ClassLabel(names=list(intent2id)),
    'labels': Sequence(ClassLabel(names=list(ner2id))),
})

# Casting attaches the ClassLabel metadata to the processed splits.
processed_datasets = processed_datasets.cast(features)
| ``` | |
These first four steps are essential for model training/fine-tuning.
For model inference, you will need to execute the same steps on any new input text.
| --- | |
| ## Multitask Model | |
| ### 1. Multitask Model class | |
| To use the model you will need to define a `multitask_model.py` with the custom model class built upon our base model. | |
| ```python | |
| from transformers import AutoModel, PreTrainedModel | |
| import torch.nn as nn | |
class MultitaskModel(PreTrainedModel):
    """Transformer encoder with two task heads.

    One head classifies the whole utterance into an intent (read off the
    [CLS] token); the other tags every token with a BIO entity label.
    When both label tensors are supplied, ``forward`` returns the sum of
    the two cross-entropy losses under the ``"loss"`` key.
    """

    def __init__(self, config, num_intent_labels: int, num_ner_labels: int):
        super().__init__(config)
        self.num_intent_labels = num_intent_labels
        self.num_ner_labels = num_ner_labels
        # Backbone encoder (e.g. DistilBERT) built from the config alone.
        self.transformer = AutoModel.from_config(config)
        # Both heads share the same two-layer MLP shape.
        self.intent_classifier = self._build_head(config.dim, num_intent_labels)
        self.ner_classifier = self._build_head(config.dim, num_ner_labels)
        # Dropout applied to the encoder outputs before each head.
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    @staticmethod
    def _build_head(hidden_dim: int, num_labels: int) -> nn.Sequential:
        """Two-layer MLP head: dim -> dim//2 -> num_labels, GELU + dropout."""
        return nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.GELU(),  # smooth activation, common in Transformer heads
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, num_labels),
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        intent_label=None,  # intent targets; enables the intent loss
        labels=None,        # per-token NER targets; enables the NER loss
    ):
        encoder_out = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        hidden = encoder_out.last_hidden_state  # (batch, seq_len, dim)

        # Sentence-level representation: the first ([CLS]) token.
        intent_logits = self.intent_classifier(self.dropout(hidden[:, 0, :]))

        # Token-level representations feed the NER head.
        ner_logits = self.ner_classifier(self.dropout(hidden))

        total_loss = 0
        if intent_label is not None and labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            intent_loss = loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1))
            # CrossEntropyLoss skips positions labelled -100 (its default ignore_index).
            ner_loss = loss_fct(ner_logits.view(-1, self.num_ner_labels), labels.view(-1))
            # Unweighted sum; reweight here if one task should dominate.
            total_loss = intent_loss + ner_loss

        return {
            "loss": total_loss,
            "intent_logits": intent_logits,
            "ner_logits": ner_logits,
        }
| ``` | |
| --- | |
| ### 2. Load Tokenizer | |
```python
# Base checkpoint shared by the tokenizer and the multitask model.
model_name = "distilbert-base-uncased"

# The fast tokenizer provides `word_ids()` / offset mappings used in preprocessing.
tokenizer = AutoTokenizer.from_pretrained(model_name)
| ``` | |
| --- | |
| ### 3. Custom Metrics Function | |
| This function is essential for a multitask model. It will be called by the Trainer at the end of each epoch to calculate both intent accuracy and NER F1-score. | |
| ```python | |
def compute_metrics(eval_pred):
    """Compute intent accuracy/F1 and entity-level NER F1 for the Trainer.

    ``eval_pred`` carries ``((intent_logits, ner_logits),
    (intent_labels, ner_labels))`` as produced by the multitask model.
    """
    (intent_logits, ner_logits), (intent_labels, ner_labels) = eval_pred

    # --- Intent metrics ---
    intent_preds = np.argmax(intent_logits, axis=1)
    intent_accuracy = accuracy_score(intent_labels, intent_preds)
    intent_f1 = f1_score(intent_labels, intent_preds, average='weighted')

    # --- NER metrics ---
    ner_preds = np.argmax(ner_logits, axis=2)
    # Id -> tag-name lookup recovered from the processed dataset features.
    id2ner = processed_datasets['train'].features['labels'].feature.names

    # Drop ignored positions (label -100) and map ids to tag strings.
    true_ner_labels = [
        [id2ner[lab] for lab in row if lab != -100]
        for row in ner_labels
    ]
    true_ner_predictions = [
        [id2ner[pred] for lab, pred in zip(lab_row, pred_row) if lab != -100]
        for lab_row, pred_row in zip(ner_labels, ner_preds)
    ]
    ner_f1 = ner_f1_score(true_ner_labels, true_ner_predictions, mode='strict', scheme=IOB2)

    return {
        "intent_accuracy": intent_accuracy,
        "intent_f1": intent_f1,
        "ner_f1": ner_f1,
    }
| ``` | |
| --- | |
| ### 4. Instantiate the model | |
| We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head. | |
```python
# Recover the label vocabularies from the processed dataset features so the
# config stays in sync with the data.
id2intent = processed_datasets['train'].features['intent_label'].names
intent2id = {label: idx for idx, label in enumerate(id2intent)}
id2ner = processed_datasets['train'].features['labels'].feature.names
ner2id = {label: idx for idx, label in enumerate(id2ner)}

# Extend the base config with our custom label maps so they are
# serialized alongside the model.
config = AutoConfig.from_pretrained(
    model_name,
    id2label_intent=id2intent,
    label2id_intent=intent2id,
    id2label_ner=id2ner,
    label2id_ner=ner2id
)

# Build the multitask model around the extended config.
model = MultitaskModel(config, num_intent_labels=len(id2intent), num_ner_labels=len(id2ner))
| ``` | |