{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "# imports\n", "import pandas as pd\n", "import numpy as np\n", "# import matplotlib as plt\n", "import random as rn\n", "import os\n", "os.environ['PYTHONHASHSEED'] = '0'\n", "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n", "np.random.seed(37)\n", "rn.seed(1254)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load data, train, test, validation splits" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SentenceLabel
S.No.
1Introduction to Quantum Mechanics1.0
2In this chapter, we explore the foundational p...0.0
3The Rise and Fall of Civilizations1.0
4Historical records reveal the complex trajecto...0.0
5Part III: Advanced Mathematical Concepts1.0
\n", "
" ], "text/plain": [ " Sentence Label\n", "S.No. \n", "1 Introduction to Quantum Mechanics 1.0\n", "2 In this chapter, we explore the foundational p... 0.0\n", "3 The Rise and Fall of Civilizations 1.0\n", "4 Historical records reveal the complex trajecto... 0.0\n", "5 Part III: Advanced Mathematical Concepts 1.0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Label
count198.000000
mean0.555051
std0.313770
min0.000000
25%0.300000
50%0.650000
75%0.800000
max1.000000
\n", "
" ], "text/plain": [ " Label\n", "count 198.000000\n", "mean 0.555051\n", "std 0.313770\n", "min 0.000000\n", "25% 0.300000\n", "50% 0.650000\n", "75% 0.800000\n", "max 1.000000" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(198, 2)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# EDA\n", "path_to_data = \"./data/Sentences_200.csv\"\n", "new_data_5_cat = pd.read_csv(path_to_data, index_col='S.No.')\n", "print(type(new_data_5_cat))\n", "display(new_data_5_cat.head())\n", "display(new_data_5_cat.describe())\n", "display(new_data_5_cat.shape)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['Sentence', 'Label', 'S.No.'],\n", " num_rows: 160\n", "})" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Dataset({\n", " features: ['Sentence', 'Label', 'S.No.'],\n", " num_rows: 20\n", "})" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Dataset({\n", " features: ['Sentence', 'Label', 'S.No.'],\n", " num_rows: 18\n", "})" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Make test, train, cv splits\n", "from datasets import Dataset\n", "ds = Dataset.from_pandas(new_data_5_cat)\n", "\n", "# NOTE: np.random.seed/rn.seed in the imports cell do NOT seed the datasets\n", "# library's splitter; pass seed= explicitly so the train/cv/test partition\n", "# is reproducible under Restart & Run All.\n", "ds_train_temp_dict = ds.train_test_split(train_size=160, seed=37)\n", "ds_train = ds_train_temp_dict['train']\n", "ds_test_cv_dict = ds_train_temp_dict['test'].train_test_split(test_size=20, seed=37)\n", "ds_cv = ds_test_cv_dict['train']\n", "ds_test = ds_test_cv_dict['test']\n", "display(ds_train)\n", "display(ds_test)\n", "display(ds_cv)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fine tune LLM" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Geetansh\\Desktop\\New_folder\\venv\\Lib\\site-packages\\transformers\\convert_slow_tokenizer.py:561: UserWarning: 
The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "['▁My', '▁name', '▁is', '▁Geeta', 'n', 'sh', '▁Bhardwaj', '.']" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get Tokenizer\n", "from transformers import AutoTokenizer\n", "model_nm = 'microsoft/deberta-v3-small'\n", "tokz = AutoTokenizer.from_pretrained(model_nm)\n", "tokz.tokenize('My name is Geetansh Bhardwaj.')" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 160/160 [00:00<00:00, 3348.83 examples/s]\n" ] } ], "source": [ "# Tokenize the 'Sentence' column\n", "def tokenize_string(batch):\n", "    # the tokenizer accepts a list of strings, so this works under batched mapping\n", "    return tokz(batch['Sentence'])\n", "\n", "def tokenize_sentence_col(ds):\n", "    '''\n", "    Tokenize the 'Sentence' column, adding the tokenizer's output columns\n", "    (input_ids, attention_mask, ...) that are used for fine-tuning.\n", "    ds: a dataset with a 'Sentence' column\n", "    '''\n", "\n", "    # batch_size only takes effect together with batched=True; without it,\n", "    # map() ran example-by-example and silently ignored batch_size=5\n", "    tokenized_ds = ds.map(tokenize_string, batched=True, batch_size=5)\n", "    return tokenized_ds\n", "\n", "tokenized_ds_train = tokenize_sentence_col(ds_train)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 18/18 [00:00<00:00, 1504.20 examples/s]\n" ] } ], "source": [ "# An undocumented fact: Transformers assume that your label column is named \"labels\". 
Ours is named \"Label\", so we will change that\n", "tokenized_ds_train = tokenized_ds_train.rename_columns({'Label' : 'labels'})\n", "\n", "tokenized_ds_cv = tokenize_sentence_col(ds_cv)\n", "tokenized_ds_cv = tokenized_ds_cv.rename_columns({'Label' : 'labels'})" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "# Get the model (We are actually using a pre-trained one)\n", "# num_labels=1 gives a single-output head, matching the float 0-1 'labels' column\n", "from transformers import AutoModelForSequenceClassification\n", "my_model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Geetansh\\Desktop\\New_folder\\venv\\Lib\\site-packages\\transformers\\training_args.py:1559: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n", " warnings.warn(\n", "C:\\Users\\Geetansh\\AppData\\Local\\Temp\\ipykernel_6212\\1403743469.py:8: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. 
Use `processing_class` instead.\n", " trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,\n" ] } ], "source": [ "from transformers import TrainingArguments, Trainer\n", "bs = 5\n", "epochs = 4\n", "lr = 8e-5\n", "args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,\n", " evaluation_strategy=\"epoch\", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,\n", " num_train_epochs=epochs, weight_decay=0.01, report_to='none')\n", "trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,\n", " tokenizer=tokz)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/16 [1:22:50\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
00.850.855345
10.400.310814
20.800.904199
30.850.871012
40.700.783445
50.300.300442
60.750.934489
70.850.909616
80.700.582580
90.900.936297
100.700.914760
110.200.345520
120.900.773511
130.200.482110
140.400.433981
150.200.279442
160.750.892115
170.300.224499
180.000.252870
190.000.077972
\n", "" ], "text/plain": [ " a b\n", "0 0.85 0.855345\n", "1 0.40 0.310814\n", "2 0.80 0.904199\n", "3 0.85 0.871012\n", "4 0.70 0.783445\n", "5 0.30 0.300442\n", "6 0.75 0.934489\n", "7 0.85 0.909616\n", "8 0.70 0.582580\n", "9 0.90 0.936297\n", "10 0.70 0.914760\n", "11 0.20 0.345520\n", "12 0.90 0.773511\n", "13 0.20 0.482110\n", "14 0.40 0.433981\n", "15 0.20 0.279442\n", "16 0.75 0.892115\n", "17 0.30 0.224499\n", "18 0.00 0.252870\n", "19 0.00 0.077972" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Using MAE to calculate loss\n", "def get_mae(preds, real):\n", " '''\n", " preds, real: array \n", " '''\n", "\n", " mae = np.mean(np.abs(preds - real))\n", " return mae\n", "\n", "real = np.array(tokenized_ds_test['labels'])\n", "\n", "print(f\"MAE: {get_mae(preds, real)}\")\n", "\n", "# Print predictions on test side-by-side\n", "m = pd.DataFrame({'a':real.reshape(20,), 'b':preds.reshape(20)})\n", "m" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# MAE of my model: 0.1 (Based on test set)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Check if your GPU is available" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "torch.cuda.is_available()" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 2 }