{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Named Entity Recognition with Peft Model 🤗\n", "\n", "##### In this notebook, we will learn how to perform Named Entity Recognition(NER) on the CoNLL-2003 dataset using the Trainer class\n", "\n", "##### This notebook has been adapted from the main NLP course here - https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt#fine-tuning-the-model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#install the required libraries\n", "!pip install -q datasets evaluate transformers seqeval" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Import required libraries\n", "from datasets import load_dataset\n", "from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline\n", "from peft import get_peft_model, LoraConfig, TaskType\n", "import evaluate\n", "import numpy as np\n", "from huggingface_hub import notebook_login" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", " num_rows: 14041\n", " })\n", " validation: Dataset({\n", " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", " num_rows: 3250\n", " })\n", " test: Dataset({\n", " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", " num_rows: 3453\n", " })\n", "})\n" ] } ], "source": [ "raw_datasets = load_dataset(\"conll2003\")\n", "print(raw_datasets)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the tokens of the first training example\n", "raw_datasets[\"train\"][0][\"tokens\"]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the NER tags of the first training example\n", "raw_datasets[\"train\"][0][\"ner_tags\"]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the label names for the NER tags\n", "ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n", "label_names = ner_feature.feature.names\n", "label_names" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "EU rejects German call to boycott British lamb . \n", "B-ORG O B-MISC O O O B-MISC O O \n" ] } ], "source": [ "words = raw_datasets[\"train\"][0][\"tokens\"]\n", "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "line1 = \"\"\n", "line2 = \"\"\n", "for word, label in zip(words, labels):\n", " full_label = label_names[label]\n", " max_length = max(len(word), len(full_label))\n", " line1 += word + \" \" * (max_length - len(word) + 1)\n", " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", "\n", "print(line1)\n", "print(line2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Load the tokenizer\n", "model_checkpoint = \"bert-base-cased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]',\n", " 'EU',\n", " 'rejects',\n", " 'German',\n", " 'call',\n", " 'to',\n", " 'boycott',\n", " 'British',\n", " 'la',\n", " '##mb',\n", " '.',\n", " '[SEP]']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tokenize the first training example\n", "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", "inputs.tokens()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def align_labels_with_tokens(labels, word_ids):\n", " new_labels = []\n", " current_word = None\n", " for word_id in word_ids:\n", " if word_id != current_word:\n", " # Start of a new word!\n", " current_word = word_id\n", " label = -100 if word_id is None else labels[word_id]\n", " new_labels.append(label)\n", " elif word_id is None:\n", " # Special token\n", " new_labels.append(-100)\n", " else:\n", " # Same word as previous token\n", " label = labels[word_id]\n", " # If the label is B-XXX we change it to I-XXX\n", " if label % 2 == 1:\n", " label += 1\n", " new_labels.append(label)\n", "\n", " return new_labels" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]\n", "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n" ] } ], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "word_ids = inputs.word_ids()\n", "print(labels)\n", "print(align_labels_with_tokens(labels, word_ids))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_align_labels(examples):\n", " tokenized_inputs = tokenizer(\n", " examples[\"tokens\"], truncation=True, is_split_into_words=True\n", " )\n", " all_labels = examples[\"ner_tags\"]\n", " new_labels = []\n", " for i, labels in enumerate(all_labels):\n", " word_ids = tokenized_inputs.word_ids(i)\n", " new_labels.append(align_labels_with_tokens(labels, word_ids))\n", "\n", " tokenized_inputs[\"labels\"] = new_labels\n", " return tokenized_inputs" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "tokenized_datasets = raw_datasets.map(\n", " tokenize_and_align_labels,\n", " batched=True,\n", " remove_columns=raw_datasets[\"train\"].column_names,\n", ")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n", "[-100, 1, 2, -100]\n" ] } ], "source": [ "for i in range(2):\n", " print(tokenized_datasets[\"train\"][i][\"labels\"])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "metric = evaluate.load(\"seqeval\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Create label mappings\n", "id2label = {i: label for i, label in enumerate(label_names)}\n", "label2id = {v: k for k, v in id2label.items()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the pre-trained model\n", "model = AutoModelForTokenClassification.from_pretrained(\n", " model_checkpoint,\n", " id2label=id2label,\n", " label2id=label2id,\n", ")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config.num_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Configure LoRA (Low-Rank Adaptation) for fine-tuning\n", "peft_config = LoraConfig(target_modules = [\"query\", \"key\"], task_type = TaskType.TOKEN_CLS)\n", "\n", "model = get_peft_model(model, peft_config)\n", "model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def compute_metrics(eval_preds):\n", " logits, labels = eval_preds\n", " predictions = np.argmax(logits, axis=-1)\n", "\n", " # Remove ignored index (special tokens) and convert to labels\n", " true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n", " true_predictions = [\n", " [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n", " for prediction, label in zip(predictions, labels)\n", " ]\n", " all_metrics = metric.compute(predictions=true_predictions, references=true_labels)\n", " return {\n", " \"precision\": all_metrics[\"overall_precision\"],\n", " \"recall\": all_metrics[\"overall_recall\"],\n", " \"f1\": all_metrics[\"overall_f1\"],\n", " \"accuracy\": all_metrics[\"overall_accuracy\"],\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "args = TrainingArguments(\n", " \"bert-finetuned-ner-lora\",\n", " eval_strategy=\"epoch\",\n", " per_device_train_batch_size=32, # decrease this for OOM error\n", " per_device_eval_batch_size=64,\n", " save_strategy=\"epoch\",\n", " learning_rate=2e-3,\n", " num_train_epochs=5,\n", " weight_decay=0.01,\n", " load_best_model_at_end=True,\n", " do_eval=True,\n", " do_predict=True,\n", " metric_for_best_model=\"accuracy\",\n", " label_names=[\"labels\"],\n", " push_to_hub=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainer = Trainer(\n", " model=model,\n", " args=args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"validation\"],\n", " data_collator=data_collator,\n", " processing_class=tokenizer,\n", " compute_metrics=compute_metrics\n", ")\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "Device set to use xpu:0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", "entity_idx: 1, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", "entity_idx: 2, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n" ] }, { "data": { "text/plain": [ "[{'entity_group': 'PER',\n", " 'score': 0.9702984,\n", " 'word': 'Jino',\n", " 'start': 11,\n", " 'end': 15}]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from peft import PeftModel\n", "\n", "# Replace this with your own checkpoint\n", "lora_checkpoint = \"./bert-finetuned-ner-lora\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "base_model = AutoModelForTokenClassification.from_pretrained(\n", " model_checkpoint,\n", " id2label=id2label,\n", " label2id=label2id,\n", ")\n", "lora_model = PeftModel.from_pretrained(base_model, lora_checkpoint)\n", "token_classifier = pipeline(\n", " \"token-classification\", model=lora_model, tokenizer=tokenizer, aggregation_strategy=\"simple\"\n", ")\n", "\n", "token_classifier(\"My name is Jino.\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 2 }