## Named Entity Recognition with a PEFT (LoRA) Model 🤗

##### In this notebook, we will learn how to perform Named Entity Recognition (NER) on the CoNLL-2003 dataset by fine-tuning BERT with a LoRA adapter (PEFT) using the Trainer class

##### This notebook has been adapted from the main NLP course here - https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt#fine-tuning-the-model

In [None]:
#install the required libraries
!pip install -q datasets evaluate transformers seqeval

In [None]:
# Import required libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
from huggingface_hub import notebook_login

In [2]:
# Load the CoNLL-2003 dataset and print its splits, columns and row counts
raw_datasets = load_dataset("conll2003")
print(raw_datasets)

DatasetDict({
 train: Dataset({
 features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 num_rows: 14041
 })
 validation: Dataset({
 features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 num_rows: 3250
 })
 test: Dataset({
 features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 num_rows: 3453
 })
})


In [3]:
# Look at the tokens of the first training example (the text is already split into words)
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [4]:
# Look at the NER tags of the first training example (one integer class id per word)
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [5]:
# Get the label names for the NER tags: the class names are read from the inner
# `feature` of the `ner_tags` column and are indexed by the integer tag id
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [6]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]

# Print each word above its tag name, padding both to a shared column width
# (the longer of the two, plus one separating space) so the columns line up.
tag_names = [label_names[tag_id] for tag_id in labels]
column_widths = [max(len(w), len(t)) + 1 for w, t in zip(words, tag_names)]
line1 = "".join(w.ljust(width) for w, width in zip(words, column_widths))
line2 = "".join(t.ljust(width) for t, width in zip(tag_names, column_widths))

print(line1)
print(line2)

EU rejects German call to boycott British lamb . 
B-ORG O B-MISC O O O B-MISC O O 


In [7]:
# Load the tokenizer for the checkpoint that is fine-tuned later in the notebook
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# Tokenize the first training example. The input is already a list of words,
# hence is_split_into_words=True; the displayed tokens show which words were
# split into several sub-tokens.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [9]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: label ids, one per word, indexed by word id.
        word_ids: for each token, the index of the word it belongs to, or
            None for a token that belongs to no word (special token).

    Returns:
        A list with one entry per element of ``word_ids``: -100 for tokens
        without a word, the word's label for its first token, and for the
        following tokens of the same word the label with odd ids bumped by one
        (B-XXX -> I-XXX in the label layout used by this notebook).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: marked with -100 and it ends the current word
            aligned.append(-100)
            previous_word = None
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # First token of a new word keeps the word's label unchanged
            previous_word = word_id
            aligned.append(word_label)
        else:
            # Continuation of the same word: a B-XXX label (odd id) becomes I-XXX
            aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)

    return aligned

In [10]:
# Sanity-check the alignment on the first training example: the aligned list has
# one entry per token, with -100 at the positions of the special tokens
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [11]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: a batch with a "tokens" entry (one word list per example)
            and an "ner_tags" entry (one label-id list per example).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding the labels aligned to the tokens of each example.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) maps the tokens of the idx-th example back to its words
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(idx))
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [12]:
# Tokenize every split in batches; the original columns are removed so that only
# what tokenize_and_align_labels returns (tokenizer outputs + "labels") remains
tokenized_datasets = raw_datasets.map(
 tokenize_and_align_labels,
 batched=True,
 remove_columns=raw_datasets["train"].column_names,
)

In [13]:
# Collator that assembles token-classification batches; it is handed to the Trainer later on
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [14]:
# Show the aligned labels of the first two tokenized training examples
for i in range(2):
 print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


In [15]:
# Load the seqeval metric; compute_metrics uses it to score predictions
metric = evaluate.load("seqeval")

In [16]:
# Build the two lookup tables (id -> label name, label name -> id) from label_names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Load the pre-trained checkpoint as a token-classification model, passing the
# label mappings so the model config carries the NER label names
model = AutoModelForTokenClassification.from_pretrained(
 model_checkpoint,
 id2label=id2label,
 label2id=label2id,
)

In [18]:
# Check the number of labels in the model config (should equal len(label_names))
model.config.num_labels

9

In [None]:
# Display the model's module structure
model

In [None]:
# Configure LoRA (Low-Rank Adaptation) for fine-tuning: a token-classification
# task, with adapters targeting the modules named "query" and "key"
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    target_modules=["query", "key"],
)

# Wrap the model with the LoRA configuration and report its trainable parameters
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [21]:
def compute_metrics(eval_preds):
    """Score a batch of predictions with the seqeval metric.

    Args:
        eval_preds: a pair ``(logits, labels)``. The predicted class of each
            token is the argmax of ``logits`` over its last axis.
            (assumes logits is (batch, seq, num_labels) — TODO confirm)

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Positions labelled -100 are ignored; the remaining ids become label names
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [
            label_names[pred]
            for pred, tag in zip(pred_row, label_row)
            if tag != -100
        ]
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    precision = scores["overall_precision"]
    recall = scores["overall_recall"]
    f1 = scores["overall_f1"]
    accuracy = scores["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [None]:
# Log in to the Hugging Face Hub (the training arguments set push_to_hub=True)
notebook_login()

In [None]:
# Training configuration for the LoRA fine-tuning run
args = TrainingArguments(
    output_dir="bert-finetuned-ner-lora",
    # Optimisation settings
    learning_rate=2e-3,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=32,  # decrease this for OOM error
    per_device_eval_batch_size=64,
    # Evaluate and save once per epoch; the best model is chosen by accuracy
    do_eval=True,
    do_predict=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    label_names=["labels"],
    # Upload the result to the Hugging Face Hub
    push_to_hub=True,
)

In [None]:
# Assemble the Trainer: model and arguments, train/validation splits, and the
# tokenizer, collator and metric function defined earlier in the notebook
trainer = Trainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Run the fine-tuning
trainer.train()

In [31]:
from peft import PeftModel

# Replace this with your own checkpoint
lora_checkpoint = "./bert-finetuned-ner-lora"

# Rebuild the tokenizer and a fresh base model with the same label mappings,
# then load the adapter stored at lora_checkpoint on top of that base model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
base_model = AutoModelForTokenClassification.from_pretrained(
 model_checkpoint,
 id2label=id2label,
 label2id=label2id,
)
lora_model = PeftModel.from_pretrained(base_model, lora_checkpoint)
# Token-classification pipeline around the adapted model; with
# aggregation_strategy="simple" the results are reported per entity group
token_classifier = pipeline(
 "token-classification", model=lora_model, tokenizer=tokenizer, aggregation_strategy="simple"
)

# Try the pipeline on a sample sentence
token_classifier("My name is Jino.")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use xpu:0


entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
entity_idx: 1, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
entity_idx: 2, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


[{'entity_group': 'PER',
 'score': 0.9702984,
 'word': 'Jino',
 'start': 11,
 'end': 15}]