| # Intended Use | |
| ## Data Preparation | |
| ### 1. Load the datasets | |
| Load `train.jsonl`, `validation.jsonl`, and `test.jsonl` splits. | |
| ```python | |
| raw_datasets = load_dataset("andreaceto/hasd") | |
| ``` | |
| --- | |
| ### 2. Create Label Mappings | |
| Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme. | |
| ```python | |
# --- Intent label mappings ---
# Deterministic id assignment: sort the distinct intent strings before numbering.
intent_labels = sorted(raw_datasets['train'].unique('intent'))
id2intent = dict(enumerate(intent_labels))
intent2id = {name: idx for idx, name in id2intent.items()}
print(f"Intent mapping (intent2id): {intent2id}\n")

# --- Entity (NER) label mappings, BIO scheme ---
entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]

# "O" marks tokens outside any entity; each entity type contributes a
# B- (beginning) tag followed by an I- (inside) tag.
ner_tags = ["O"] + [f"{prefix}-{name}" for name in entity_labels for prefix in ("B", "I")]

id2ner = dict(enumerate(ner_tags))
ner2id = {tag: idx for idx, tag in id2ner.items()}
print(f"NER mapping (ner2id): {ner2id}")
| ``` | |
| --- | |
| ### 3. Preprocessing function | |
This is the core function. It processes a batch of examples (it is mapped with `batched=True`) and does two things:
1. Tokenizes the text.
2. Aligns character-based entity spans (`start`, `end`) with the resulting wordpiece tokens, assigning the correct BIO tag ID to each token.
| ```python | |
def preprocess_function(examples):
    """Tokenize a batch of examples and attach aligned intent / BIO-NER label ids.

    For each example this:
      1. maps the intent string to its integer id via ``intent2id``,
      2. tokenizes the text with offset mapping (padding is deferred to the
         Trainer / data collator), and
      3. projects the character-level entity spans onto wordpiece tokens,
         tagging the first overlapping token ``B-<label>`` and every
         subsequent token of the same span ``I-<label>``; all other
         tokens stay ``"O"``.
    """
    # --- Intent Processing ---
    # Convert intent strings to integer IDs
    intent_ids = [intent2id[intent] for intent in examples['intent']]
    # --- Tokenization ---
    # Tokenize the text. `truncation=True` and `padding` are handled by the Trainer later.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)
    # --- Entity (NER) Label Alignment ---
    ner_labels = []
    for i, entities in enumerate(examples['entities']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        offsets = tokenized_inputs['offset_mapping'][i]
        # Start with all tokens labeled as 'O' (Outside).
        # NOTE(review): special tokens ([CLS]/[SEP]) also keep 'O' here; if they
        # should be excluded from the loss they would need -100 instead, but that
        # would conflict with the ClassLabel cast performed later — TODO confirm.
        label_ids = [ner2id["O"]] * len(word_ids)
        # For each entity, find the corresponding tokens and assign B- and I- tags
        for entity in entities:
            start_char, end_char, label = entity['start'], entity['end'], entity['label']
            # BUG FIX: only the FIRST overlapping token gets 'B-'; every later
            # token of the same entity gets 'I-'. (The previous version tested
            # `label_ids[j] == ner2id["O"]`, which is true for every token in
            # the span, so all of them were tagged 'B-' and 'I-' never appeared.)
            seen_first_token = False
            for j, word_id in enumerate(word_ids):
                if word_id is None:
                    continue
                # Get the character span for the current token
                token_char_span = offsets[j]
                if token_char_span is None:
                    continue
                token_start, token_end = token_char_span
                # Check if the token overlaps the entity's character span
                if start_char < token_end and end_char > token_start:
                    if not seen_first_token:
                        label_ids[j] = ner2id[f"B-{label}"]
                        seen_first_token = True
                    else:
                        label_ids[j] = ner2id[f"I-{label}"]
        ner_labels.append(label_ids)
    # Add the final processed labels to our tokenized inputs
    tokenized_inputs["intent_label"] = intent_ids
    tokenized_inputs["labels"] = ner_labels
    # Remove offset_mapping: it is not a model input
    tokenized_inputs.pop("offset_mapping", None)
    return tokenized_inputs
| ``` | |
| --- | |
| ### 4. Apply Preprocessing and Save | |
| Now we apply this function to our entire dataset and save the final, processed version. | |
| ```python | |
# Run the preprocessing over every split; dropping the raw columns leaves
# only the model inputs and labels.
processed_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets['train'].column_names)

# Re-declare the schema so both label columns carry their human-readable
# names as ClassLabel features.
features = Features({
    'input_ids': Sequence(Value('int64')),
    'attention_mask': Sequence(Value('int8')),
    'intent_label': ClassLabel(names=list(intent2id)),
    'labels': Sequence(ClassLabel(names=list(ner2id))),
})

# Casting attaches the ClassLabel metadata to the processed splits.
processed_datasets = processed_datasets.cast(features)
| ``` | |
These first four steps are essential for model training/fine-tuning.
For model inference, you will need to execute the same steps on any new input text.
| --- | |
| ## Multitask Model | |
| ### 1. Multitask Model class | |
| To use the model you will need to define a `multitask_model.py` with the custom model class built upon our base model. | |
| ```python | |
| from transformers import AutoModel, PreTrainedModel | |
| import torch.nn as nn | |
class MultitaskModel(PreTrainedModel):
    """Transformer encoder with two task heads.

    One head classifies the whole utterance into an intent (read off the
    [CLS] token); the other tags every token with a BIO entity label.
    When both label tensors are supplied, ``forward`` returns the sum of
    the two cross-entropy losses under the ``"loss"`` key.
    """

    def __init__(self, config, num_intent_labels: int, num_ner_labels: int):
        super().__init__(config)
        self.num_intent_labels = num_intent_labels
        self.num_ner_labels = num_ner_labels
        # Backbone encoder (e.g. DistilBERT) built from the config alone.
        self.transformer = AutoModel.from_config(config)
        # Both heads share the same two-layer MLP shape.
        self.intent_classifier = self._build_head(config.dim, num_intent_labels)
        self.ner_classifier = self._build_head(config.dim, num_ner_labels)
        # Dropout applied to the encoder outputs before each head.
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    @staticmethod
    def _build_head(hidden_dim: int, num_labels: int) -> nn.Sequential:
        """Two-layer MLP head: dim -> dim//2 -> num_labels, GELU + dropout."""
        return nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.GELU(),  # smooth activation, common in Transformer heads
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, num_labels),
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        intent_label=None,  # intent targets; enables the intent loss
        labels=None,        # per-token NER targets; enables the NER loss
    ):
        encoder_out = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        hidden = encoder_out.last_hidden_state  # (batch, seq_len, dim)

        # Sentence-level representation: the first ([CLS]) token.
        intent_logits = self.intent_classifier(self.dropout(hidden[:, 0, :]))

        # Token-level representations feed the NER head.
        ner_logits = self.ner_classifier(self.dropout(hidden))

        total_loss = 0
        if intent_label is not None and labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            intent_loss = loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label.view(-1))
            # CrossEntropyLoss skips positions labelled -100 (its default ignore_index).
            ner_loss = loss_fct(ner_logits.view(-1, self.num_ner_labels), labels.view(-1))
            # Unweighted sum; reweight here if one task should dominate.
            total_loss = intent_loss + ner_loss

        return {
            "loss": total_loss,
            "intent_logits": intent_logits,
            "ner_logits": ner_logits,
        }
| ``` | |
| --- | |
| ### 2. Load Tokenizer | |
```python
# Base checkpoint shared by the tokenizer and the multitask model.
model_name = "distilbert-base-uncased"

# The fast tokenizer provides `word_ids()` / offset mappings used in preprocessing.
tokenizer = AutoTokenizer.from_pretrained(model_name)
| ``` | |
| --- | |
| ### 3. Custom Metrics Function | |
| This function is essential for a multitask model. It will be called by the Trainer at the end of each epoch to calculate both intent accuracy and NER F1-score. | |
| ```python | |
def compute_metrics(eval_pred):
    """Compute intent accuracy/F1 and entity-level NER F1 for the Trainer.

    ``eval_pred`` carries ``((intent_logits, ner_logits),
    (intent_labels, ner_labels))`` as produced by the multitask model.
    """
    (intent_logits, ner_logits), (intent_labels, ner_labels) = eval_pred

    # --- Intent metrics ---
    intent_preds = np.argmax(intent_logits, axis=1)
    intent_accuracy = accuracy_score(intent_labels, intent_preds)
    intent_f1 = f1_score(intent_labels, intent_preds, average='weighted')

    # --- NER metrics ---
    ner_preds = np.argmax(ner_logits, axis=2)
    # Id -> tag-name lookup recovered from the processed dataset features.
    id2ner = processed_datasets['train'].features['labels'].feature.names

    # Drop ignored positions (label -100) and map ids to tag strings.
    true_ner_labels = [
        [id2ner[lab] for lab in row if lab != -100]
        for row in ner_labels
    ]
    true_ner_predictions = [
        [id2ner[pred] for lab, pred in zip(lab_row, pred_row) if lab != -100]
        for lab_row, pred_row in zip(ner_labels, ner_preds)
    ]
    ner_f1 = ner_f1_score(true_ner_labels, true_ner_predictions, mode='strict', scheme=IOB2)

    return {
        "intent_accuracy": intent_accuracy,
        "intent_f1": intent_f1,
        "ner_f1": ner_f1,
    }
| ``` | |
| --- | |
| ### 4. Instantiate the model | |
| We now create an instance of our `MultitaskModel`, passing it a configuration object that includes the number of labels for each head. | |
```python
# Recover the label vocabularies from the processed dataset features so the
# config stays in sync with the data.
id2intent = processed_datasets['train'].features['intent_label'].names
intent2id = {label: idx for idx, label in enumerate(id2intent)}
id2ner = processed_datasets['train'].features['labels'].feature.names
ner2id = {label: idx for idx, label in enumerate(id2ner)}

# Extend the base config with our custom label maps so they are
# serialized alongside the model.
config = AutoConfig.from_pretrained(
    model_name,
    id2label_intent=id2intent,
    label2id_intent=intent2id,
    id2label_ner=id2ner,
    label2id_ner=ner2id
)

# Build the multitask model around the extended config.
model = MultitaskModel(config, num_intent_labels=len(id2intent), num_ner_labels=len(id2ner))
| ``` | |