andreaceto commited on
Commit
ba7da8c
·
verified ·
1 Parent(s): 7f5a53f

Create how_to_use.md

Browse files
Files changed (1) hide show
  1. how_to_use.md +208 -0
how_to_use.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Intended Use
2
+
3
+ ## Data Preparation
4
+
5
+ ### 1. Load the datasets
6
+ Load `train.jsonl`, `validation.jsonl`, and `test.jsonl` splits.
7
+
8
+ ```python
9
+ raw_datasets = load_dataset("andreaceto/hasd")
10
+ ```
11
+
12
+ ---
13
+
14
+ ### 2. Create Label Mappings
15
+ Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme.
16
+
17
+ ```python
18
# --- Create Intent Label Mappings ---
# Collect the unique intent labels present in the training split
intent_labels = raw_datasets['train'].unique('intent')
intent_labels.sort()  # deterministic ordering across runs
id2intent = dict(enumerate(intent_labels))
intent2id = {label: idx for idx, label in enumerate(intent_labels)}
print(f"Intent mapping (intent2id): {intent2id}\n")


# --- Create Entity (NER) Label Mappings in BIO format ---
# The entity types annotated in the dataset
entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]
# "O" marks tokens outside any entity; every entity type then contributes
# a B- (beginning) and an I- (inside) tag, in that order.
ner_tags = ["O"]
for entity_label in entity_labels:
    ner_tags.extend((f"B-{entity_label}", f"I-{entity_label}"))

id2ner = dict(enumerate(ner_tags))
ner2id = {tag: idx for idx, tag in enumerate(ner_tags)}
print(f"NER mapping (ner2id): {ner2id}")
39
+ ```
40
+
41
+ ---
42
+
43
+ ### 3. Preprocessing function
44
+ This is the core function. It takes a batch of examples (the dataset is mapped with `batched=True`) and does two things:
45
+ 1. Tokenizes the text.
46
+ 2. Aligns character-based entity spans (`start`, `end`) with the new wordpiece tokens, assigning the correct BIO tag ID to each token.
47
+
48
+ ```python
49
def preprocess_function(examples):
    """Tokenize a batch of examples and align intent / BIO entity labels.

    Args:
        examples: a batched dict with keys 'text' (list[str]), 'intent'
            (list[str]) and 'entities' (list[list[dict]]), where each entity
            dict carries character offsets 'start', 'end' and a 'label'.

    Returns:
        The tokenizer output augmented with 'intent_label' (int intent IDs)
        and 'labels' (per-token BIO tag IDs); 'offset_mapping' is removed.

    Relies on module-level `tokenizer`, `intent2id` and `ner2id`.
    """
    # --- Intent Processing ---
    # Convert intent strings to integer IDs
    intent_ids = [intent2id[intent] for intent in examples['intent']]

    # --- Tokenization ---
    # `return_offsets_mapping=True` gives per-token character spans, which we
    # need to align character-level entity annotations with wordpiece tokens.
    # Padding is handled later by the Trainer's data collator.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)

    # --- Entity (NER) Label Alignment ---
    ner_labels = []
    for i, entities in enumerate(examples['entities']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        # Start with all tokens labeled as 'O' (Outside)
        label_ids = [ner2id["O"]] * len(word_ids)

        # For each entity, tag its first overlapping token 'B-' and the rest 'I-'
        for entity in entities:
            start_char, end_char, label = entity['start'], entity['end'], entity['label']

            # BUG FIX: the original checked `label_ids[j] == ner2id["O"]` to
            # decide between B- and I-. Since every token starts as 'O', ALL
            # tokens of an entity received a B- tag and I- was never assigned
            # within a single entity. Track the first token explicitly instead.
            first_token_of_entity = True

            for j, word_id in enumerate(word_ids):
                if word_id is None:
                    # Special tokens ([CLS], [SEP], ...) have no word id
                    continue

                # Get the character span for the current token
                token_char_span = tokenized_inputs['offset_mapping'][i][j]
                if token_char_span is None:
                    continue

                token_start, token_end = token_char_span

                # Check if the token overlaps the entity's character span
                if start_char < token_end and end_char > token_start:
                    if first_token_of_entity:
                        # Assign the 'B-' tag to the first token only
                        label_ids[j] = ner2id[f"B-{label}"]
                        first_token_of_entity = False
                    else:
                        # Subsequent tokens of the same entity get 'I-'
                        label_ids[j] = ner2id[f"I-{label}"]

        ner_labels.append(label_ids)

    # Add the final processed labels to our tokenized inputs
    tokenized_inputs["intent_label"] = intent_ids
    tokenized_inputs["labels"] = ner_labels

    # offset_mapping is only needed for alignment; drop it before training
    tokenized_inputs.pop("offset_mapping", None)

    return tokenized_inputs
100
+ ```
101
+
102
+ ---
103
+
104
+ ### 4. Apply Preprocessing and Save
105
+ Now we apply this function to our entire dataset and save the final, processed version.
106
+
107
+ ```python
108
+ # Apply the function to all splits of the dataset
109
# Run the preprocessing function over every split; the original columns are
# dropped so only the model inputs and labels remain.
processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

# Describe the processed schema explicitly, attaching human-readable label
# names via ClassLabel so they travel with the dataset.
features = Features({
    'input_ids': Sequence(Value('int64')),
    'attention_mask': Sequence(Value('int8')),
    'intent_label': ClassLabel(names=list(intent2id.keys())),
    'labels': Sequence(ClassLabel(names=list(ner2id.keys()))),
})

# Cast so the integer labels are interpreted through the named ClassLabels
processed_datasets = processed_datasets.cast(features)
121
+ ```
122
+
123
+ These first four steps are essential for model training/fine-tuning.
124
+
125
+ For model inference you will need to execute the same steps on new input text.
126
+
127
+ ---
128
+
129
+ ## Multitask Model definition
130
+ To use the model you will need to define a `multitask_model.py` with the custom model class built upon our base model.
131
+
132
+ ```python
133
+ from transformers import AutoModel, PreTrainedModel
134
+ import torch.nn as nn
135
+
136
+
137
class MultitaskModel(PreTrainedModel):
    """
    A custom Transformer model with two heads: one for intent classification
    and one for named entity recognition (token classification).

    Args:
        config: the base model config (attribute names `dim` and
            `seq_classif_dropout` are DistilBERT-style — confirm if you swap
            in a different backbone).
        num_intent_labels: number of intent classes.
        num_ner_labels: number of BIO tags.
    """
    def __init__(self, config, num_intent_labels: int, num_ner_labels: int):
        super().__init__(config)
        self.num_intent_labels = num_intent_labels
        self.num_ner_labels = num_ner_labels

        # Base transformer (e.g. DistilBERT). `from_config` builds an
        # uninitialized skeleton; trained weights arrive via from_pretrained
        # or a loaded state_dict.
        self.transformer = AutoModel.from_config(config)

        # --- Heads ---
        # 1. Intent Classification Head (MLP over the [CLS] embedding)
        self.intent_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),  # GELU is a smooth activation function, common in Transformers
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_intent_labels)
        )

        # 2. NER (Token Classification) Head (MLP applied to every token)
        self.ner_classifier = nn.Sequential(
            nn.Linear(config.dim, config.dim // 2),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(config.dim // 2, self.num_ner_labels)
        )

        # Dropout layer for regularization, shared by both heads
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        intent_label=None,  # For calculating intent loss
        labels=None,        # For calculating NER loss
    ):
        """Run both heads; returns a dict with 'loss' (None at inference),
        'intent_logits' (batch, num_intents) and 'ner_logits'
        (batch, seq_len, num_ner_labels)."""
        # Last hidden states from the base transformer
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)

        # --- Intent Logits ---
        # Use the [CLS] (first) token's output for intent classification
        cls_token_output = self.dropout(sequence_output[:, 0, :])
        intent_logits = self.intent_classifier(cls_token_output)

        # --- NER Logits ---
        # Use all token outputs for NER
        ner_logits = self.ner_classifier(self.dropout(sequence_output))

        # --- Calculate Combined Loss ---
        # FIX: the original initialized total_loss to the int 0 and returned
        # it even when no labels were supplied; HF convention is loss=None at
        # inference, and an int 0 would break loss.backward() in a trainer.
        total_loss = None
        if intent_label is not None and labels is not None:
            # CrossEntropyLoss ignores targets equal to -100 by default,
            # which is how padded/special NER positions are skipped.
            loss_fct = nn.CrossEntropyLoss()
            # Intent loss
            intent_loss = loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1))
            # NER loss (ignore padding tokens with label -100)
            ner_loss = loss_fct(ner_logits.view(-1, self.num_ner_labels), labels.view(-1))
            # Combine the losses (weight them if one task matters more)
            total_loss = intent_loss + ner_loss

        return {
            "loss": total_loss,
            "intent_logits": intent_logits,
            "ner_logits": ner_logits,
        }
208
+ ```