# Imports

In [1]:
# imports
import pandas as pd
import numpy as np
# import matplotlib as plt
import random as rn
import os
os.environ['PYTHONHASHSEED'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = ''
np.random.seed(37)
rn.seed(1254)

# Load data, train, test, validation splits

In [2]:
# EDA
path_to_data = "./data/Sentences_200.csv"
new_data_5_cat = pd.read_csv(path_to_data, index_col='S.No.')
print(type(new_data_5_cat))
display(new_data_5_cat.head())
display(new_data_5_cat.describe())
display(new_data_5_cat.shape)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,Sentence,Label
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Introduction to Quantum Mechanics,1.0
2,"In this chapter, we explore the foundational p...",0.0
3,The Rise and Fall of Civilizations,1.0
4,Historical records reveal the complex trajecto...,0.0
5,Part III: Advanced Mathematical Concepts,1.0


Unnamed: 0,Label
count,198.0
mean,0.555051
std,0.31377
min,0.0
25%,0.3
50%,0.65
75%,0.8
max,1.0


(198, 2)

In [3]:
# Make test, train, cv splits
from datasets import Dataset
ds = Dataset.from_pandas(new_data_5_cat)

ds_train_temp_dict = ds.train_test_split(train_size=160)
ds_train = ds_train_temp_dict['train']
ds_test_cv_dict = ds_train_temp_dict['test'].train_test_split(test_size=20)
ds_cv = ds_test_cv_dict['train']
ds_test = ds_test_cv_dict['test']
display(ds_train)
display(ds_test)
display(ds_cv)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['Sentence', 'Label', 'S.No.'],
    num_rows: 160
})

Dataset({
    features: ['Sentence', 'Label', 'S.No.'],
    num_rows: 20
})

Dataset({
    features: ['Sentence', 'Label', 'S.No.'],
    num_rows: 18
})

# Fine tune LLM

In [None]:
# Get Tokenizer
from transformers import AutoTokenizer
model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)
tokz.tokenize('My name is Geetansh Bhardwaj.')



['‚ñÅMy', '‚ñÅname', '‚ñÅis', '‚ñÅGeeta', 'n', 'sh', '‚ñÅBhardwaj', '.']

In [5]:
# Tokenize the 'Sentence' column
def tokenize_string(row):
    return tokz(row['Sentence'])

def tokenize_sentence_col(ds):
    '''
    We will tokenize the 'Sentence' column and add another column 'Sentence_id'. It will be used for fine-tuning
    ds: a dataset with 'Sentence' column
    '''

    tokenized_ds = ds.map(tokenize_string, batch_size=5)
    return tokenized_ds

tokenized_ds_train = tokenize_sentence_col(ds_train)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 160/160 [00:00<00:00, 4079.69 examples/s]


In [6]:
# An undocumented fact: Transformers assume that your label column is named "labels". Ours is named "Label", so we will change that
tokenized_ds_train = tokenized_ds_train.rename_columns({'Label' : 'labels'})
tokenized_ds_train

tokenized_ds_cv = tokenize_sentence_col(ds_cv)
tokenized_ds_cv = tokenized_ds_cv.rename_columns({'Label' : 'labels'})

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:00<00:00, 2243.01 examples/s]


In [7]:
# Get the model (We are actually using a pre-trained one)
from transformers import AutoModelForSequenceClassification
my_model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments, Trainer
bs = 5
epochs = 4
lr = 8e-5
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')
trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,
                  tokenizer=tokz)




  trainer = Trainer(my_model, args, train_dataset=tokenized_ds_train, eval_dataset=tokenized_ds_cv,


In [9]:
# Train (Here, fine tune) the model
trainer.train()

                                                
 25%|‚ñà‚ñà‚ñå       | 32/128 [00:10<00:26,  3.56it/s]

{'eval_loss': 0.09050914645195007, 'eval_runtime': 0.3554, 'eval_samples_per_second': 50.653, 'eval_steps_per_second': 5.628, 'epoch': 1.0}


                                                
 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 64/128 [00:19<00:17,  3.68it/s]

{'eval_loss': 0.04030601680278778, 'eval_runtime': 0.3239, 'eval_samples_per_second': 55.567, 'eval_steps_per_second': 6.174, 'epoch': 2.0}


                                                
 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 97/128 [00:28<00:10,  2.98it/s]

{'eval_loss': 0.022483834996819496, 'eval_runtime': 0.3246, 'eval_samples_per_second': 55.448, 'eval_steps_per_second': 6.161, 'epoch': 3.0}


                                                 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 128/128 [00:41<00:00,  3.07it/s]

{'eval_loss': 0.0200485959649086, 'eval_runtime': 0.3606, 'eval_samples_per_second': 49.921, 'eval_steps_per_second': 5.547, 'epoch': 4.0}
{'train_runtime': 41.7528, 'train_samples_per_second': 15.328, 'train_steps_per_second': 3.066, 'train_loss': 0.11997667700052261, 'epoch': 4.0}





TrainOutput(global_step=128, training_loss=0.11997667700052261, metrics={'train_runtime': 41.7528, 'train_samples_per_second': 15.328, 'train_steps_per_second': 3.066, 'total_flos': 1818871829700.0, 'train_loss': 0.11997667700052261, 'epoch': 4.0})

In [10]:
# Report loss for your model using the test set
tokenized_ds_test = tokenize_sentence_col(ds_test)
tokenized_ds_test = tokenized_ds_test.rename_columns({'Label' : 'labels'})

preds = trainer.predict(tokenized_ds_test).predictions.astype(float)
preds

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:00<00:00, 162.84 examples/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 13.26it/s]


array([0.86230469, 0.28979492, 0.91162109, 0.86816406, 0.87988281,
       0.21826172, 0.91064453, 0.89013672, 0.41748047, 0.8984375 ,
       0.89355469, 0.14257812, 0.89160156, 0.35131836, 0.34375   ,
       0.23815918, 0.87841797, 0.20471191, 0.10784912, 0.02485657])

In [11]:
# Using MAE to calculate loss
def get_mae(preds, real):
    '''
    preds, real: array 
    '''

    mae = np.mean(np.abs(preds - real))
    return mae

real = np.array(tokenized_ds_test['labels'])

print(f"MAE: {get_mae(preds, real)}")

# Print predictions on test side-by-side
m = pd.DataFrame({'a':real.reshape(20,), 'b':preds.reshape(20)})
m

MAE: 0.09301467895507813


Unnamed: 0,a,b
0,0.85,0.862305
1,0.4,0.289795
2,0.8,0.911621
3,0.85,0.868164
4,0.7,0.879883
5,0.3,0.218262
6,0.75,0.910645
7,0.85,0.890137
8,0.7,0.41748
9,0.9,0.898438


In [12]:
# MAE of my model: 0.1 (Based on test set)

# Check if your GPU is available

In [5]:
import torch
torch.cuda.is_available()

True

# Try Exporting the model

#### How to pass input to the model for inference

In [None]:
import torch

# Use GPU if available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the same device
my_model.to(device)

# Tokenize input and ensure tensors are returned
sentence = "Hey, it's Geetansh"
output = tokz(sentence, return_tensors='pt')

# Move input tensors to the same device as the model
output = {key: val.to(device) for key, val in output.items()}
# print(output)

# Set model to evaluation mode
my_model.eval()

# Perform inference without tracking gradients
with torch.no_grad():
    # Pass tokenized input to the model
    predictions = my_model(**output)

# Print predictions
print(predictions)


SequenceClassifierOutput(loss={'logits': tensor([[0.6899]], device='cuda:0')}, logits=tensor([[0.6899]], device='cuda:0'), hidden_states=None, attentions=None)


### Method 1

In [None]:
# Save the model and tokeniser to disk
save_dir = "./saved_model"
# tokz.save_pretrained(save_directory=save_dir)
# my_model.save_pretrained(save_directory=save_dir)

# Use GPU if available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved model and tokeniser from the disk 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
loaded_tokeniser = AutoTokenizer.from_pretrained(save_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_dir)

loaded_model.to(device)

# Test with the dummy input
# Create a dummy input (same structure as your tokenizer output)
dummy_input = loaded_tokeniser("This is a test sentence.", return_tensors='pt')
dummy_input = {key: val.to(device) for key, val in dummy_input.items()}

with torch.no_grad():
    output = loaded_model(**dummy_input)
print(output)   

SequenceClassifierOutput(loss=None, logits=tensor([[0.3520]], device='cuda:0'), hidden_states=None, attentions=None)


### Method 2

In [None]:
# Save the model and tokeniser to disk
save_dir = "./saved_model2"
# trainer.save_model(save_dir)

# Use GPU if available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved model and tokeniser from the disk 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
loaded_tokeniser = AutoTokenizer.from_pretrained(save_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_dir)

loaded_model.to(device)

# Test with the same dummy input as before
# Create a dummy input (same structure as your tokenizer output)
dummy_input = loaded_tokeniser("This is a test sentence.", return_tensors='pt')
dummy_input = {key: val.to(device) for key, val in dummy_input.items()}

with torch.no_grad():
    output = loaded_model(**dummy_input)
print(output)  

SequenceClassifierOutput(loss=None, logits=tensor([[0.3520]], device='cuda:0'), hidden_states=None, attentions=None)
