"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# a chart using matplotlib to show the distribution of intents in the training set\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np \n",
"intent_counts = np.bincount(train_labels)\n",
"intent_labels = [int2str(i) for i in range(len(intent_counts))] \n",
"plt.figure(figsize=(30, 6))\n",
"plt.bar(intent_labels, intent_counts, color='skyblue')\n",
"plt.xlabel('Intent')\n",
"plt.ylabel('Number of Samples')\n",
"plt.title('Distribution of Intents in Training Set')\n",
"plt.xticks(rotation=45)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "497b4bf1",
"metadata": {},
"source": [
"# **Step 2: Fine-tune BERT**"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ee8fcd88",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-07-08 21:33:04.935732: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-07-08 21:33:04.944945: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1751999584.956302 11434 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1751999584.959828 11434 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1751999584.968805 11434 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1751999584.968821 11434 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1751999584.968822 11434 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1751999584.968823 11434 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-07-08 21:33:04.971765: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"from transformers import BertTokenizer, BertForSequenceClassification, Trainer\n",
"\n",
"NUM_CLASSES = dataset[\"train\"].features[\"intent\"].num_classes\n",
"\n",
"# Load tokenizer and model\n",
"tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n",
"model = BertForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=NUM_CLASSES)"
]
},
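{
"cell_type": "markdown",
"id": "a1f3c9d2",
"metadata": {},
"source": [
"The `newly initialized` warning above is expected: the BERT encoder weights come from the checkpoint, while the classification head is freshly initialized and must be fine-tuned. A small sanity check (a sketch added for illustration, not part of the original pipeline) confirms the head matches the number of intent classes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2e4d0a3",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch: confirm the new classification head matches NUM_CLASSES\n",
"print(\"num_labels:\", model.config.num_labels)\n",
"print(\"classifier weight shape:\", tuple(model.classifier.weight.shape))  # (NUM_CLASSES, hidden_size)\n",
"n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"print(f\"trainable parameters: {n_trainable:,}\")"
]
},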
{
"cell_type": "markdown",
"id": "d9952a6f",
"metadata": {},
"source": [
"### **Step 2.1: Tekonize the data**"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "92259600",
"metadata": {},
"outputs": [],
"source": [
"# tekonize the data\n",
"def tokenize_function(examples):\n",
" return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=128) \n",
"\n",
"tokenized_datasets = dataset.map(tokenize_function, batched=True)\n"
]
},
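{
"cell_type": "markdown",
"id": "c3d5e1b4",
"metadata": {},
"source": [
"Before mapping over the whole dataset, it helps to inspect what `tokenize_function` adds to a single example. The utterance below is a made-up sample, not drawn from the dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e6f2c5",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: tokenize one made-up utterance and inspect the added fields\n",
"sample = tokenizer(\"how do i reset my password\", truncation=True, padding=\"max_length\", max_length=128)\n",
"print(list(sample.keys()))  # input_ids, token_type_ids, attention_mask\n",
"print(sample[\"input_ids\"][:12])  # [CLS] ... [SEP], then zero padding\n",
"print(tokenizer.convert_ids_to_tokens(sample[\"input_ids\"][:12]))"
]
},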
{
"cell_type": "markdown",
"id": "1482b785",
"metadata": {},
"source": [
"### **Step 2.2: Set format for PyTorch**\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3a8796e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['text', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 7600\n",
" })\n",
" validation: Dataset({\n",
" features: ['text', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 3100\n",
" })\n",
" test: Dataset({\n",
" features: ['text', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 5500\n",
" })\n",
"})"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenized_datasets.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\", \"intent\"])\n",
"tokenized_datasets"
]
},
{
"cell_type": "markdown",
"id": "2eae6122",
"metadata": {},
"source": [
"### **Step 2.3: Create DataLoaders**\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9e68f14d",
"metadata": {},
"outputs": [],
"source": [
"from torch.utils.data import DataLoader\n",
"\"\"\"\n",
"DataLoaders are used to efficiently load batches of tokenized data for training and validation of the BERT model\n",
"---> train_loader provides batches of the training data to the model during training.\n",
"---> val_loader provides batches of the validation data to evaluate the model's performance during or after training.\n",
"\"\"\"\n",
"train_loader = DataLoader(tokenized_datasets[\"train\"], batch_size=16, shuffle=True) \n",
"val_loader = DataLoader(tokenized_datasets[\"validation\"], batch_size=16)\n"
]
},
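{
"cell_type": "markdown",
"id": "e5f7a3d6",
"metadata": {},
"source": [
"Because `set_format(type=\"torch\")` was applied above, each batch comes out of the DataLoader as a dict of tensors. A quick shape check (sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a8b4e7",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: pull one batch and confirm the tensor shapes the model will see\n",
"batch = next(iter(train_loader))\n",
"print({k: tuple(v.shape) for k, v in batch.items()})\n",
"# expected: input_ids (16, 128), attention_mask (16, 128), intent (16,)"
]
},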
{
"cell_type": "markdown",
"id": "703e816a",
"metadata": {},
"source": [
"### **Step 2.4: Train BERT (fine-tuning)**\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "870c036e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'labels': tensor(108),\n",
" 'input_ids': tensor([ 101, 2064, 2017, 3328, 2033, 2083, 4292, 2039, 3622, 10042,\n",
" 2000, 2026, 2924, 1997, 4274, 10995, 4070, 102, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0]),\n",
" 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0])}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tokenize and prepare dataset for Trainer\n",
"def tokenize_function(examples):\n",
" return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=128)\n",
"\n",
"encoded_dataset = dataset.map(tokenize_function, batched=True)\n",
"encoded_dataset = encoded_dataset.rename_column(\"intent\", \"labels\")\n",
"encoded_dataset.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n",
"\n",
"# Check a sample\n",
"encoded_dataset[\"train\"][0]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "faf2683a",
"metadata": {},
"outputs": [],
"source": [
"from transformers import TrainingArguments, Trainer\n",
"import numpy as np\n",
"from sklearn.metrics import accuracy_score, f1_score\n",
"\n",
"def compute_metrics(eval_pred):\n",
" logits, labels = eval_pred\n",
" predictions = np.argmax(logits, axis=-1)\n",
" acc = accuracy_score(labels, predictions)\n",
" f1 = f1_score(labels, predictions, average='weighted')\n",
" return {\"accuracy\": acc, \"f1\": f1}\n",
"\n",
"# For older transformers versions, use eval_strategy instead of evaluation_strategy\n",
"training_args = TrainingArguments(\n",
" output_dir=\"./results\",\n",
" num_train_epochs=3,\n",
" per_device_train_batch_size=16,\n",
" per_device_eval_batch_size=16,\n",
" save_strategy=\"epoch\",\n",
" eval_strategy=\"epoch\",\n",
" logging_dir=\"./logs\",\n",
" fp16=True,\n",
" load_best_model_at_end=True,\n",
" metric_for_best_model=\"accuracy\",\n",
")"
]
},
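{
"cell_type": "markdown",
"id": "a7b9c5f8",
"metadata": {},
"source": [
"`compute_metrics` receives raw logits and integer labels from the Trainer's evaluation loop, so it can be unit-tested with a tiny fabricated batch before training (the values below are made up purely for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c0d6a9",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: exercise compute_metrics on fabricated logits/labels (illustrative values only)\n",
"fake_logits = np.array([[2.0, 0.1, 0.3],\n",
"                        [0.2, 1.5, 0.1],\n",
"                        [0.1, 0.2, 0.9]])\n",
"fake_labels = np.array([0, 1, 0])  # third example is intentionally mispredicted\n",
"print(compute_metrics((fake_logits, fake_labels)))  # accuracy should be 2/3"
]
},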
{
"cell_type": "code",
"execution_count": 22,
"id": "f3468509",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_11434/311998613.py:1: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
" trainer = Trainer(\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [1425/1425 02:20, Epoch 3/3]\n",
"
\n",
" \n",
" \n",
" \n",
" | Epoch | \n",
" Training Loss | \n",
" Validation Loss | \n",
" Accuracy | \n",
" F1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" No log | \n",
" 1.967664 | \n",
" 0.842903 | \n",
" 0.832499 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3.454100 | \n",
" 0.793594 | \n",
" 0.919677 | \n",
" 0.916521 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1.118200 | \n",
" 0.547787 | \n",
" 0.931935 | \n",
" 0.929519 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"TrainOutput(global_step=1425, training_loss=1.7426024105674343, metrics={'train_runtime': 140.7662, 'train_samples_per_second': 161.971, 'train_steps_per_second': 10.123, 'total_flos': 1501739374694400.0, 'train_loss': 1.7426024105674343, 'epoch': 3.0})"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=encoded_dataset[\"train\"],\n",
" eval_dataset=encoded_dataset[\"validation\"],\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
")\n",
"\n",
"trainer.train()"
]
},
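{
"cell_type": "markdown",
"id": "c9d1e7b0",
"metadata": {},
"source": [
"The Trainer keeps every logged metric in `trainer.state.log_history`, which makes it easy to pull the per-epoch validation numbers out of the run above (a small sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0e2f8c1",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: print per-epoch validation metrics recorded during training\n",
"for entry in trainer.state.log_history:\n",
"    if \"eval_accuracy\" in entry:\n",
"        print(f\"epoch {entry['epoch']:.0f}: \"\n",
"              f\"loss={entry['eval_loss']:.4f}  \"\n",
"              f\"acc={entry['eval_accuracy']:.4f}  \"\n",
"              f\"f1={entry['eval_f1']:.4f}\")"
]
},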
{
"cell_type": "code",
"execution_count": 23,
"id": "bb548dc5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [344/344 00:05]\n",
"
\n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Accuracy: 0.8407\n",
"Test F1 Score: 0.8239\n"
]
}
],
"source": [
"# evaluate the model using test dataset\n",
"test_results = trainer.evaluate(encoded_dataset[\"test\"])\n",
"print(f\"Test Accuracy: {test_results['eval_accuracy']:.4f}\")\n",
"print(f\"Test F1 Score: {test_results['eval_f1']:.4f}\") \n",
"\n"
]
},
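{
"cell_type": "markdown",
"id": "e1f3a9d2",
"metadata": {},
"source": [
"Aggregate accuracy and F1 hide which intents get confused with each other. A per-class breakdown via `trainer.predict` and scikit-learn's `classification_report` can show that (a sketch; it assumes the `int2str` helper from the data-loading step is still in scope):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2a4b0e3",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: per-class precision/recall/F1 on the test split (assumes int2str from earlier)\n",
"from sklearn.metrics import classification_report\n",
"\n",
"pred_output = trainer.predict(encoded_dataset[\"test\"])\n",
"y_pred = np.argmax(pred_output.predictions, axis=-1)\n",
"y_true = pred_output.label_ids\n",
"names = [int2str(i) for i in range(NUM_CLASSES)]\n",
"print(classification_report(y_true, y_pred, labels=list(range(NUM_CLASSES)),\n",
"                            target_names=names, zero_division=0))"
]
},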
{
"cell_type": "code",
"execution_count": 25,
"id": "bad9523e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('./intent_classifier_tokenizer/tokenizer_config.json',\n",
" './intent_classifier_tokenizer/special_tokens_map.json',\n",
" './intent_classifier_tokenizer/vocab.txt',\n",
" './intent_classifier_tokenizer/added_tokens.json')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#save the model and tokenizer\n",
"model.save_pretrained(\"./intent_classifier_model\")\n",
"tokenizer.save_pretrained(\"./intent_classifier_tokenizer\")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "68e9853f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The predicted intent is: Report Fraud\n"
]
}
],
"source": [
"# load the model and tokenizer\n",
"from transformers import BertForSequenceClassification, BertTokenizer \n",
"model = BertForSequenceClassification.from_pretrained(\"./intent_classifier_model\")\n",
"tokenizer = BertTokenizer.from_pretrained(\"./intent_classifier_tokenizer\")\n",
"\n",
"# Example usage of the loaded model and tokenizer\n",
"text = \"I want to take a fraud loan\"\n",
"inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True, max_length=128)\n",
"outputs = model(**inputs)\n",
"predictions = outputs.logits.argmax(dim=-1)\n",
"predicted_intent = int2str(predictions.item())\n",
"\n",
"if predicted_intent == \"oos\":\n",
" print(\"The intent is out of scope (OOS).\")\n",
"else:\n",
" # preprocessing the predicted intent\n",
" predicted_intent = predicted_intent.replace(\"_\", \" \").title()\n",
" print(f\"The predicted intent is: {predicted_intent}\")"
]
}
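,
{
"cell_type": "markdown",
"id": "a3b5c1f4",
"metadata": {},
"source": [
"The cell above returns only the argmax label. For routing real user queries it can help to inspect the softmax confidence as well and fall back to `oos` when the model is unsure. A sketch: `predict_intent` is a hypothetical helper, and the 0.5 threshold is an arbitrary assumption to tune on validation data."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4c6d2a5",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: intent prediction with a softmax confidence score\n",
"# (predict_intent is a hypothetical helper; the 0.5 threshold is arbitrary)\n",
"import torch\n",
"\n",
"def predict_intent(text, threshold=0.5):\n",
"    inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True, max_length=128)\n",
"    with torch.no_grad():\n",
"        logits = model(**inputs).logits\n",
"    probs = torch.softmax(logits, dim=-1)\n",
"    conf, idx = probs.max(dim=-1)\n",
"    label = int2str(idx.item())\n",
"    if label == \"oos\" or conf.item() < threshold:\n",
"        return \"oos\", conf.item()\n",
"    return label, conf.item()\n",
"\n",
"print(predict_intent(\"what is my account balance\"))"
]
}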
],
"metadata": {
"kernelspec": {
"display_name": "AI",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.23"
}
},
"nbformat": 4,
"nbformat_minor": 5
}