Update how_to_use.md

29fe96a verified 7 months ago

12 kB

	# Training/Fine-tuning

	## Data Preparation

	### 1. Load the datasets
	Load `train.jsonl`, `validation.jsonl`, and `test.jsonl` splits.

	```python
	# Assumes `load_dataset` is already imported (presumably `from datasets import load_dataset` — confirm).
	# The result is indexed by split name further down, e.g. raw_datasets['train'].
	raw_datasets = load_dataset("andreaceto/hasd")
	```

	---

	### 2. Create Label Mappings
	Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme.

	```python
	# --- Intent label <-> id lookup tables ---
	# Collect the distinct intents of the training split, sorted so that the
	# ids assigned below are the same on every run
	intent_labels = sorted(raw_datasets['train'].unique('intent'))
	intent2id = {name: idx for idx, name in enumerate(intent_labels)}
	id2intent = dict(enumerate(intent_labels))
	print(f"Intent mapping (intent2id): {intent2id}\n")


	# --- Create Entity (NER) Label Mappings in BIO format ---
	# Entity types to tag
	entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]
	# Build the full list of BIO tags; "O" (outside any entity) comes first, so it gets id 0
	ner_tags = ["O"]
	for label in entity_labels:
	    ner_tags.append(f"B-{label}")  # 'B' for Beginning of an entity
	    ner_tags.append(f"I-{label}")  # 'I' for Inside of an entity

	# Position in ner_tags is the tag id, in both directions
	id2ner = {i: label for i, label in enumerate(ner_tags)}
	ner2id = {label: i for i, label in enumerate(ner_tags)}
	print(f"NER mapping (ner2id): {ner2id}")
	```

	---

	### 3. Preprocessing function
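	This is the core function. It expects `tokenizer` (see *Load Tokenizer* below) and the label mappings from the previous step to be defined already. It takes a batch of data examples (it is applied with `batched=True`) and does two things for each one: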
	1. Tokenizes the text.
	2. Aligns character-based entity spans (`start`, `end`) with the new wordpiece tokens, assigning the correct BIO tag ID to each token.

	```python
	def preprocess_function(examples):
	    """Tokenize a batch of examples and build intent and token-level NER labels.

	    Relies on the module-level `tokenizer`, `intent2id` and `ner2id`.

	    Args:
	        examples: A batch with parallel lists under 'text', 'intent' and
	            'entities'. Each entity is a mapping with character offsets
	            'start' and 'end' and a 'label'.

	    Returns:
	        The tokenizer output with 'intent_label' (one id per example) and
	        'labels' (one BIO tag id per token) added, and 'offset_mapping' removed.
	    """
	    # --- Intent Processing ---
	    intent_ids = [intent2id[intent] for intent in examples['intent']]

	    # --- Tokenization ---
	    # Offsets are requested so character-level entity spans can be matched to tokens
	    tokenized_inputs = tokenizer(
	        examples['text'],
	        truncation=True,
	        is_split_into_words=False,
	        return_offsets_mapping=True,
	    )

	    # --- Entity (NER) Label Alignment ---
	    ner_labels = []
	    for i, entities in enumerate(examples['entities']):
	        word_ids = tokenized_inputs.word_ids(batch_index=i)
	        offsets = tokenized_inputs['offset_mapping'][i]
	        # Every token starts as "O". Tokens without a word id are skipped
	        # below, so they keep "O".
	        label_ids = [ner2id["O"]] * len(word_ids)

	        # For each entity, find the corresponding tokens and assign B- and I- tags
	        for entity in entities:
	            start_char, end_char, label = entity['start'], entity['end'], entity['label']

	            # Becomes True once the entity's first overlapping token has its 'B-' tag
	            first_token_of_entity_found = False

	            for j, word_id in enumerate(word_ids):
	                if word_id is None:
	                    continue

	                token_start, token_end = offsets[j]

	                # Skip tokens whose character span does not overlap the entity span
	                if not (start_char < token_end and end_char > token_start):
	                    continue

	                if first_token_of_entity_found:
	                    # A subsequent token of the same entity gets the 'I-' tag
	                    label_ids[j] = ner2id[f"I-{label}"]
	                else:
	                    # The first overlapping token gets the 'B-' tag
	                    label_ids[j] = ner2id[f"B-{label}"]
	                    first_token_of_entity_found = True

	        ner_labels.append(label_ids)

	    # Add the final processed labels to our tokenized inputs
	    tokenized_inputs["intent_label"] = intent_ids
	    tokenized_inputs["labels"] = ner_labels

	    # Remove offset_mapping as it's not needed by the model
	    tokenized_inputs.pop("offset_mapping", None)

	    return tokenized_inputs
	```

	---

	### 4. Apply Preprocessing and Save
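	Now we apply this function to our entire dataset and attach the label names to the final, processed version. To save it for later reuse, call `processed_datasets.save_to_disk(...)` afterwards.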

	```python
	# Apply the function to all splits of the dataset
	# Run the preprocessing over every split in batches, dropping the raw
	# columns so that only what preprocess_function returns remains
	processed_datasets = raw_datasets.map(
	    preprocess_function,
	    batched=True,
	    remove_columns=raw_datasets['train'].column_names,
	)

	# Schema of the processed data; the ClassLabel features carry the label
	# names in id order (the mapping dicts were built in that order)
	features = Features({
	    'input_ids': Sequence(Value('int64')),
	    'attention_mask': Sequence(Value('int8')),
	    'intent_label': ClassLabel(names=list(intent2id)),
	    'labels': Sequence(ClassLabel(names=list(ner2id))),
	})

	# Cast so the label names are stored with the dataset itself
	processed_datasets = processed_datasets.cast(features)
	```

	---

	## Multitask Model

	### 1. Multitask Model class
	To use the model you will need to define a `multitask_model.py` with the custom model class built upon our base model.

	```python
	import torch.nn as nn
	from transformers import AutoConfig, AutoModel, PreTrainedModel

	class MultitaskModel(PreTrainedModel):
	    """
	    A custom Transformer model with two heads: one for intent classification
	    and one for named entity recognition (token classification).

	    The config must carry `num_intent_labels` and `num_ner_labels` in
	    addition to the base model's own settings.
	    """
	    config_class = AutoConfig

	    def __init__(self, config):
	        super().__init__(config)

	        # Build the base transformer model (e.g., DistilBERT) from the config
	        self.transformer = AutoModel.from_config(config)

	        # DistilBERT calls its hidden width `dim` and its classifier dropout
	        # `seq_classif_dropout`; fall back to the BERT-style names so that
	        # other encoders can be used as the base model too
	        hidden_size = getattr(config, "dim", None) or config.hidden_size
	        dropout_prob = getattr(config, "seq_classif_dropout", None)
	        if dropout_prob is None:
	            dropout_prob = getattr(config, "hidden_dropout_prob", 0.1)

	        # --- Heads ---
	        # 1. Intent Classification Head (MLP)
	        self.intent_classifier = self._make_head(hidden_size, self.config.num_intent_labels)

	        # 2. NER (Token Classification) Head (MLP)
	        self.ner_classifier = self._make_head(hidden_size, self.config.num_ner_labels)

	        # Dropout layer for regularization, applied to the encoder output
	        self.dropout = nn.Dropout(dropout_prob)

	    @staticmethod
	    def _make_head(hidden_size, num_labels):
	        """Return the two-layer MLP used by both classification heads."""
	        return nn.Sequential(
	            nn.Linear(hidden_size, hidden_size // 2),
	            nn.GELU(),  # GELU is a smooth activation function, common in Transformers
	            nn.Dropout(0.3),
	            nn.Linear(hidden_size // 2, num_labels),
	        )

	    def forward(
	        self,
	        input_ids=None,
	        attention_mask=None,
	        intent_label=None,  # For calculating intent loss
	        labels=None,  # For calculating NER loss
	    ):
	        """Run the encoder and both heads.

	        Returns:
	            A dict with 'loss', 'intent_logits' and 'ner_logits'. 'loss' is the
	            sum of the intent and NER cross-entropy losses when both
	            `intent_label` and `labels` are given, and the integer 0 otherwise.
	        """
	        # Get the last hidden states from the base transformer model
	        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
	        sequence_output = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

	        # --- Intent Logits ---
	        # Use the first ([CLS]) token's output for intent classification
	        cls_token_output = self.dropout(sequence_output[:, 0, :])
	        intent_logits = self.intent_classifier(cls_token_output)

	        # --- NER Logits ---
	        # Use all token outputs for NER
	        ner_logits = self.ner_classifier(self.dropout(sequence_output))

	        # --- Calculate Combined Loss ---
	        total_loss = 0
	        if intent_label is not None and labels is not None:
	            # Intent loss
	            intent_loss = nn.CrossEntropyLoss()(
	                intent_logits.view(-1, self.config.num_intent_labels),
	                intent_label.view(-1),
	            )
	            # NER loss; positions labelled -100 (e.g. padding) are ignored
	            ner_loss = nn.CrossEntropyLoss(ignore_index=-100)(
	                ner_logits.view(-1, self.config.num_ner_labels),
	                labels.view(-1),
	            )
	            # Combine the losses (you can also weight them if one task is more important)
	            total_loss = intent_loss + ner_loss

	        return {
	            "loss": total_loss,
	            "intent_logits": intent_logits,
	            "ner_logits": ner_logits,
	        }
	```

	---

	### 2. Load Tokenizer

	```python
	# Base checkpoint name; the same name is used later to fetch the model config
	model_name = "distilbert-base-uncased"
	# Tokenizer matching the base checkpoint (used by preprocess_function)
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	```

	---

	### 3. Custom Metrics Function
	This function is essential for a multitask model. It will be called by the Trainer at the end of each epoch to calculate both intent accuracy and NER F1-score.

	```python
	def compute_metrics(eval_pred):
	    """Compute intent accuracy/F1 and NER F1 for one evaluation pass.

	    Reads the NER tag names from the module-level `processed_datasets`.

	    Args:
	        eval_pred: A `(predictions, label_values)` pair, where `predictions`
	            unpacks to `(intent_logits, ner_logits)` and `label_values` to
	            `(intent_labels, ner_labels)`.

	    Returns:
	        A dict with 'intent_accuracy', 'intent_f1' (weighted average) and 'ner_f1'.
	    """
	    # Unpack predictions and labels
	    predictions, label_values = eval_pred
	    intent_preds, ner_preds = predictions
	    intent_labels, ner_labels = label_values

	    # --- Intent Metrics ---
	    intent_preds = np.argmax(intent_preds, axis=1)
	    intent_accuracy = accuracy_score(intent_labels, intent_preds)
	    intent_f1 = f1_score(intent_labels, intent_preds, average='weighted')

	    # --- NER Metrics ---
	    ner_preds = np.argmax(ner_preds, axis=2)

	    # Tag names indexed by tag id
	    ner_names = processed_datasets['train'].features['labels'].feature.names

	    # Drop positions labelled -100 (padding) and convert ids to tag names
	    true_ner_labels = []
	    true_ner_predictions = []
	    for label_row, pred_row in zip(ner_labels, ner_preds):
	        kept = [
	            (label_id, pred_id)
	            for label_id, pred_id in zip(label_row, pred_row)
	            if label_id != -100
	        ]
	        true_ner_labels.append([ner_names[label_id] for label_id, _ in kept])
	        true_ner_predictions.append([ner_names[pred_id] for _, pred_id in kept])

	    ner_f1 = ner_f1_score(true_ner_labels, true_ner_predictions, mode='strict', scheme=IOB2)

	    return {
	        "intent_accuracy": intent_accuracy,
	        "intent_f1": intent_f1,
	        "ner_f1": ner_f1,
	    }
	```

	---

	### 4. Instantiate the model
	We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head.

	```python
	# --- Start from the base checkpoint's configuration ---
	config = AutoConfig.from_pretrained(model_name)


	# --- Rebuild the label mappings from the processed dataset's features ---
	# 1. Intent labels: names stored on the 'intent_label' feature, in id order
	intent_label_list = processed_datasets['train'].features['intent_label'].names
	id2intent = dict(enumerate(intent_label_list))
	intent2id = {name: idx for idx, name in id2intent.items()}

	# 2. NER labels: names stored on the inner feature of 'labels', in id order
	ner_label_list = processed_datasets['train'].features['labels'].feature.names
	id2ner = dict(enumerate(ner_label_list))
	ner2id = {name: idx for idx, name in id2ner.items()}

	# --- Attach the task-specific settings to the config ---
	# Head sizes read by MultitaskModel
	config.num_intent_labels = len(id2intent)
	config.num_ner_labels = len(id2ner)
	# Label lookups, kept on the config so they travel with the saved model
	config.id2label_intent = id2intent
	config.label2id_intent = intent2id
	config.id2label_ner = id2ner
	config.label2id_ner = ner2id

	# --- Finally instantiate the model ---
	# NOTE(review): MultitaskModel builds its encoder with AutoModel.from_config,
	# which per the transformers docs does not load pretrained weights — confirm
	# how the base checkpoint's weights are meant to be loaded before fine-tuning.
	model = MultitaskModel(config)
	```

	---

	# Inference

	## Load the Model
	Load the trained tokenizer and model:
	```python
	# Tokenizer stored alongside the trained model
	tokenizer = AutoTokenizer.from_pretrained("andreaceto/schedulebot-nlu-engine")
	# MultitaskModel is the custom class defined above (multitask_model.py); it must be importable here
	multitask_model = MultitaskModel.from_pretrained("andreaceto/schedulebot-nlu-engine")
	```

	---

	## Preprocess raw text
	To run inference with the model, first tokenize the raw input text:

	```python
	# `text` is the raw input string; return_tensors="pt" asks for PyTorch tensors
	inputs = tokenizer(text, return_tensors="pt")
	```

	---

	## Get predictions
	Now you can run inference using:

	```python
	# Gradients are not needed at inference time
	with torch.no_grad():
	    outputs = multitask_model(**inputs)
	```

	Since the model returns a dictionary, you can access logits by using:

	```python
	# One row of intent scores per input text (computed from the first token's output)
	intent_logits = outputs["intent_logits"]
	# One row of NER tag scores per token of each input text
	ner_logits = outputs["ner_logits"]
	```

	You can now extract the predictions using `torch.argmax()` and convert them into label names using the dictionaries stored in the config (`id2label_intent` and `id2label_ner`).

	N.B.: The JSON format requires all keys in an object to be strings, so when the transformers library saves your configuration it converts your integer keys (0, 1, 2, etc.) into strings ("0", "1", "2"). This causes a mismatch when the prediction extracted with `torch.argmax()` is used as a key in the config dictionaries. The solution is to cast the `int` value of the prediction to a string with `str()`, e.g. `multitask_model.config.id2label_intent[str(intent_logits.argmax(dim=-1).item())]`.