In [1]:
#Learned from https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb#scrollTo=j9TNdHlQ0CLz

### ***Install Libraries***

In [2]:
!pip install transformers -q
!pip install wandb -q
!pip install datasets
!pip install peft
!pip install peft accelerate loralib --upgrade --quiet
!pip install torchsummary
!pip install rouge-score



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_

### ***Import Libraries***

In [3]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset

# WandB – Import the wandb library
import wandb

# import PEFT and LoRA Stuffs
from peft import LoraConfig, get_peft_model, TaskType

from torchsummary import summary

from rouge_score import rouge_scorer

# Datetime for adding timestamps to training outputs
from datetime import datetime
import pytz

# General use like saving models
import os




### ***Import Dataset***

In [4]:
# load in the pubmed version of the dataset
dataset = load_dataset("scientific_papers", 'pubmed')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

### ***Check GPU***


In [5]:
!nvidia-smi



Sat Apr  6 21:17:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0              26W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### ***Use GPU***

In [6]:
# # Setting up the device for GPU usage
from torch import cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ("Using: ", device)

Using:  cuda


### ***Use wandb***

In [7]:
#!wandb login


### Function for saving pretrained model and tokenizer

In [26]:
def save_to_hf(model, tokenizer, model_name='saved_model'):

  # create folder to save model and tokenizer to
  cwd = os.getcwd()
  model_save_path = os.path.join(cwd, model_name)
  os.makedirs(model_save_path, exist_ok=False)

  # save model to local instance of colab
  model.save_pretrained(model_save_path)

  # save tokenizer to local instance of colab
  tokenizer.save_pretrained(model_save_path)

  # login to hugging face to upload
  !huggingface-cli login

  # upload model to hugging face
  !huggingface-cli upload {model_name}

### ***Custom Dataset Class***

In [9]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.abstract
        self.ctext = self.data.article

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

### ***Training Function***

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=labels)
        loss = outputs[0]

        if _%100==0:
          print ("RUN: " + str(_))
          timestamp = datetime.now(pytz.timezone('America/Los_Angeles')).strftime('%Y-%m-%d %H:%M:%S %Z%z')
          print(f"\t[{timestamp}]")
          #print(f'Epoch: {epoch}, Loss:  {loss.item()}')
          print(f'Epoch: {epoch}, Batch Size: {ids.size(0)}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

### ***Additional Layers Class***

In [None]:
import torch.nn.functional as F

class CustomLinearLayer(nn.Module):
    def __init__(self, input_size, output_size, activation=None):
        super(CustomLinearLayer, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
        self.activation = activation

    def forward(self, x):
        x = self.linear(x)
        if self.activation is not None:
            x = self.activation(x)
        return x


### ***Turned Training Function from Previous Cell into a class***

In [12]:


class T5AbstractsTrainer:
    def __init__(self, tokenizer, model, device, optimizer):
        self.tokenizer = tokenizer
        self.model = model
        self.device = device
        self.optimizer = optimizer


        #attempt to add custom layer --- i think it works? or nor? SUS
        self.linear_layer = nn.Linear(32128, 500)
        #self.linear_layer = nn.Linear(15, 500)
        self.activation = nn.Tanh()





    def forward(self, ids, mask, y_ids, labels):
      outputs = self.model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
      loss, logits = outputs.loss, outputs.logits

      # Apply custom linear layer and activation
      logits = logits.view(-1, logits.size(-1))  # Reshape logits
      linear_output = self.linear_layer(logits)
      tanh_output = self.activation(linear_output).squeeze(-1)

      # return loss, tanh_output
      return loss, tanh_output




    def train(self, epoch, loader):
        self.model.train()
        self.linear_layer.to(self.device)
        self.activation.to(self.device)


        # Print model layers
        #print("Model Layers:")
        #for name, layer in self.model.named_children():
          #print(name, layer)


        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(self.device, dtype=torch.long)
            y_ids = y[:, :-1].contiguous()
            labels = y[:, 1:].clone().detach()
            labels[y[:, 1:] == self.tokenizer.pad_token_id] = -100
            ids = data['source_ids'].to(self.device, dtype=torch.long)
            mask = data['source_mask'].to(self.device, dtype=torch.long)

            outputs = self.model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
            loss = outputs[0]



            if _ % 100 == 0:
                print("RUN:", _)
                timestamp = datetime.now(pytz.timezone('America/Los_Angeles')).strftime('%Y-%m-%d %H:%M:%S %Z%z')
                print(f"\t[{timestamp}]")
                print(f'Epoch: {epoch}, Batch Size: {ids.size(0)}, Loss: {loss.item()}')

            logits = outputs.logits.to(self.device)
            logits = logits.view(-1, logits.size(-1))  # Reshape logits to (batch_size * sequence_length, vocab_size) -- for linear layer


            #print("Device of inputs (ids, mask, y_ids, labels):", ids.device, mask.device, y_ids.device, labels.device)
            #print("Device of outputs.logits:", logits.device)
            #print("Device of linear layer weight:", self.linear_layer.weight.device)
            #print("Device of linear layer bias:", self.linear_layer.bias.device)





            #print("Output shape before linear layer and Tanh activation:", logits.shape)
            linear_output = self.linear_layer(logits).to(self.device)
            tanh_output = self.activation(linear_output).to(self.device).squeeze(-1)

            #print("tanh_output shape after linear layer and Tanh activation:", tanh_output.shape)

            #print ("AFTER LINEAR LAYER", outputs.logits.shape)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()


### ***Create Training Data Arrays***

In [10]:
#Set the 2 arrays
#Prep arrays
import pandas as pd

train_set_articles = [data['article'] for data in dataset['train']]
train_set_abstracts = [data['abstract'] for data in dataset['train']]
val_set_articles = [data['article'] for data in dataset['validation']]
val_set_abstracts = [data['abstract'] for data in dataset['validation']]
test_set_articles = [data['article'] for data in dataset['test']]
test_set_abstracts = [data['abstract'] for data in dataset['test']]

# Create DataFrame for training data
df_train = pd.DataFrame({'article': train_set_articles, 'abstract': train_set_abstracts})

# Add prefix to the article column in the training DataFrame
df_train['article'] = 'summarize: ' + df_train['article']

# Create DataFrame for validation data
df_val = pd.DataFrame({'article': val_set_articles, 'abstract': val_set_abstracts})

# Add prefix to the article column in the validation DataFrame
df_val['article'] = 'summarize: ' + df_val['article']

# Create DataFrame for test data
df_test = pd.DataFrame({'article': test_set_articles, 'abstract': test_set_abstracts})

# Add prefix to the article column in the test DataFrame
df_test['article'] = 'summarize: ' + df_test['article']


### ***MAIN Code to train and then predict***

In [None]:
#PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(42) # pytorch random seed
np.random.seed(42) # numpy random seed
torch.backends.cudnn.deterministic = True
# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-small")
#print (train_set_articles[0])
#print ("hello")

# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(df_train, tokenizer, 512, 50)
val_set = CustomDataset(df_val, tokenizer, 512, 50)

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': 8,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': 8,
    'shuffle': False,
    'num_workers': 0
    }


# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)


# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=1e-5) # 1e-4 ended with loss Loss:  2.074270486831665, now trying e-5 -- got 2.3540332317352295 ---> so far best lr seems to be 1e-4

EPOCHS = 1

for epoch in range(EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)





# ***Playground code***

In [None]:
keys = dataset['train'][0].keys()
print(keys)

print (dataset['train'][0])
# Now call the print_model function with your model instance
#print_model(model)


dict_keys(['article', 'abstract', 'section_names'])
{'article': "a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries .\nin iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively .\nthe prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% .\nanthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight .\nsnack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states\nthere are also some reports reg

### ***Peft/LORA Code to train***

In [13]:
# Set random seeds and deterministic pytorch for reproducibility
#torch.manual_seed(42) # pytorch random seed
np.random.seed(42) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")


MAX_LEN = 700
SUMMARY_LEN = 150
# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(df_train, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(df_val, tokenizer, MAX_LEN, SUMMARY_LEN)
BATCHSIZE = 7
# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': BATCHSIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': BATCHSIZE,
    'shuffle': False,
    'num_workers': 0
    }


# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

# define configuration for peft (source: https://www.philschmid.de/fine-tune-flan-t5-peft, https://huggingface.co/blog/peft)
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
                               inference_mode=False,
                               r=8,
                               lora_alpha=32,
                               lora_dropout=0.1)
# lora_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
#                               inference_mode=False,
#                               r=16,
#                               bias='lora_only',
#                               use_rslora=True,
#                               lora_dropout=0.1,
#                               )

# Defining the  PEFT model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = get_peft_model(T5ForConditionalGeneration.from_pretrained('t5-base',output_hidden_states=True), peft_config)
model.to(device)

# take a peek at the peft model for comparison to non-peft
print(f"Preview of trainable parameters in the peft model\n")
model.print_trainable_parameters()

# Defining the optimizer that will be used to tune the weights of the network in the training session.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=3e-4) # 1e-4 ended with loss Loss:  2.074270486831665, now trying e-5 -- got 2.3540332317352295 ---> so far best lr seems to be 1e-4. 3e-4 got sub 2 loss, so pretty decent

#summary(model, input_size=[(batch_size, input_length)])
#summary(model, input_size=[(8, 6)])
# print (model)


EPOCHS = 1
trainer = T5AbstractsTrainer(tokenizer, model, device, optimizer)
for epoch in range(EPOCHS):
    trainer.train(epoch, training_loader)




#for epoch in range(EPOCHS):
#        train(epoch, tokenizer, model, device, training_loader, optimizer)






Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Preview of trainable parameters in the peft model

trainable params: 884,736 || all params: 223,788,288 || trainable%: 0.3953450861557152




RUN: 0
	[2024-04-06 14:18:53 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 7.345491409301758
RUN: 100
	[2024-04-06 14:19:33 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.5926971435546875
RUN: 200
	[2024-04-06 14:20:11 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.7770352363586426
RUN: 300
	[2024-04-06 14:20:50 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.458217144012451
RUN: 400
	[2024-04-06 14:21:29 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.6213810443878174
RUN: 500
	[2024-04-06 14:22:07 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.6450307369232178
RUN: 600
	[2024-04-06 14:22:46 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.1673409938812256
RUN: 700
	[2024-04-06 14:23:25 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.455289363861084
RUN: 800
	[2024-04-06 14:24:04 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.2111077308654785
RUN: 900
	[2024-04-06 14:24:44 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.5103116035461426
RUN: 1000
	[2024-04-06 14:25:22 PDT-0700]
Epoch: 0, Batch Size: 7, Loss: 2.1050968170166016
RUN

### Testing save to hugging face function

In [14]:
# save_to_hf(trainer.model, tokenizer, model_name='t5_base_peft')

2024-04-06 23:09:35.453978: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-06 23:09:35.454022: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-06 23:09:35.455334: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[1m[31mERROR! `huggingface-cli login` uses an outdated login mechanism that is not compatible with the Hugging Face Hub backend anymore. Please use `huggingface-cli login instead.[0m
2024-04-06 23:09:45.318460: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when 

In [43]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# load model from huggingface
hf_model = AutoModelForSeq2SeqLM.from_pretrained("dsolomon/t5_base_peft")
hf_tokenizer = AutoTokenizer.from_pretrained("dsolomon/t5_base_peft")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


False

### ***Validate***

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask,
                min_length=200,
                max_length=350,
                num_beams=4,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            #if _%100==0:
                #print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals



### ***Validate with Rouge score***

In [None]:

def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    rouge_scores = {'rouge-1': {'f': [], 'p': [], 'r': []}, 'rouge-2': {'f': [], 'p': [], 'r': []}}  # Store ROUGE scores

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)

    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                min_length=200,
                max_length=350,
                num_beams=4,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]

            # Compute ROUGE scores for each pair of predictions and targets
            for pred, tgt in zip(preds, target):
                scores = scorer.score(pred, tgt)
                rouge_scores['rouge-1']['f'].append(scores['rouge1'].fmeasure)
                rouge_scores['rouge-1']['p'].append(scores['rouge1'].precision)
                rouge_scores['rouge-1']['r'].append(scores['rouge1'].recall)
                rouge_scores['rouge-2']['f'].append(scores['rouge2'].fmeasure)
                rouge_scores['rouge-2']['p'].append(scores['rouge2'].precision)
                rouge_scores['rouge-2']['r'].append(scores['rouge2'].recall)

            predictions.extend(preds)
            actuals.extend(target)

    return predictions, actuals, rouge_scores



### ***Generate Predictions***

In [None]:
final_df = {}

#just to check the first one
val_set = CustomDataset(df_val.head(1), tokenizer, 512, 50)
print ("HEAD is : ", df_val.head(1))
val_loader = DataLoader(val_set, **val_params)



for epoch in range(0,EPOCHS):
  predictions, actuals, rscore = validate(epoch, tokenizer, model, device, val_loader)
  final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})



HEAD is :                                               article  \
0  summarize: approximately , one - third of pati...   

                                            abstract  
0   background and aim : there is lack of substan...  




In [None]:
print("Hello")
print(final_df['Actual Text'])

print(final_df['Generated Text'][:100])
print(len(final_df['Generated Text']))
print (final_df.info())
print (final_df.iloc[0,0])
print (len(final_df.iloc[0,0]))

print("Rouge Score is: ", rscore)

Hello
0    background and aim : there is lack of substant...
Name: Actual Text, dtype: object
0    vte is one of the commonest causes of sudden u...
Name: Generated Text, dtype: object
1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Generated Text  1 non-null      object
 1   Actual Text     1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes
None
vte is one of the commonest causes of sudden unexplained deaths in hospitalized patients. it is not only disabling but also prolongs hospital stay and increases the cost of treatment. there was a need to systematically collect such data on patient characteristics, clinical outcomes, predictors of mortality in acute vte, management strategies and temporal trends in vte.materials and methodswe collected consecutive medical records of inpatients and outpatients between january 2006 and

In [None]:


print(final_df['Actual Text'])


print (len(final_df['Generated Text']))
print(final_df['Generated Text'])


In [None]:
model

## Experimenting with a longer max length

In [None]:
# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(42) # pytorch random seed
np.random.seed(42) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# DO NOT SET THESE TOO HIGH OR IT WILL ABSOLUTELY SHIT ON THE GPU AND YOU WILL HAVE TO RESTART THE NOTEBOOK
# USAGE WITH THESE VALUES ARE ~7GB SYSTEM RAM AND 8.6GB GPU RAM
source_len = 1000
target_len = 200

# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(df_train, tokenizer, source_len, target_len)
val_set = CustomDataset(df_val, tokenizer, source_len, target_len)

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': 8,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': 8,
    'shuffle': False,
    'num_workers': 0
    }


# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

# define configuration for peft (source: https://www.philschmid.de/fine-tune-flan-t5-peft, https://huggingface.co/blog/peft)
# peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
#                                inference_mode=False,
#                                r=8,
#                                lora_alpha=32,
#                                lora_dropout=0.1)
lora_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
                              inference_mode=False,
                              r=16,
                              bias='lora_only',
                              use_rslora=True,
                              lora_dropout=0.1,
                              )

# Defining the  PEFT model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = get_peft_model(T5ForConditionalGeneration.from_pretrained('t5-small',output_hidden_states=True), peft_config)
model.to(device)

# take a peek at the peft model for comparison to non-peft
print(f"Preview of trainable parameters in the peft model\n")
model.print_trainable_parameters()

# Defining the optimizer that will be used to tune the weights of the network in the training session.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=3e-4) # 1e-4 ended with loss Loss:  2.074270486831665, now trying e-5 -- got 2.3540332317352295 ---> so far best lr seems to be 1e-4. 3e-4 got sub 2 loss, so pretty decent

#summary(model, input_size=[(batch_size, input_length)])
#summary(model, input_size=[(8, 6)])
# print (model)


EPOCHS = 1
trainer = T5AbstractsTrainer(tokenizer, model, device, optimizer)
for epoch in range(EPOCHS):
    trainer.train(epoch, training_loader)




#for epoch in range(EPOCHS):
#        train(epoch, tokenizer, model, device, training_loader, optimizer)






You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Preview of trainable parameters in the peft model

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850403779272945
PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(i



RUN: 0
	[2024-04-03 00:55:40 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 5.606602668762207
RUN: 100
	[2024-04-03 00:56:07 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 2.9735965728759766
RUN: 200
	[2024-04-03 00:56:32 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 2.890866279602051
RUN: 300
	[2024-04-03 00:56:59 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 3.4168484210968018
RUN: 400
	[2024-04-03 00:57:25 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 2.409773111343384
RUN: 500
	[2024-04-03 00:57:51 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 2.6389687061309814
RUN: 600
	[2024-04-03 00:58:16 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 3.0353121757507324
RUN: 700
	[2024-04-03 00:58:42 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 2.7438268661499023
RUN: 800
	[2024-04-03 00:59:09 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 3.5309488773345947
RUN: 900
	[2024-04-03 00:59:35 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 3.0657596588134766
RUN: 1000
	[2024-04-03 01:00:02 PDT-0700]
Epoch: 0, Batch Size: 8, Loss: 3.2870266437530518
RUN

In [None]:
!nvidia-smi

Wed Apr  3 07:48:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              24W / 300W |      2MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# wipes the GPU memory without having to restart colab but now nothing works :(
# !pip install numba

# import numba
# cuda_device = numba.cuda.get_current_device()
# cuda_device.reset()



# Redefining T5AbstractsTrainer Class

In [11]:
T5_MODEL = 'google-t5/t5-base'

In [12]:
# removing additional linear layer and tanh activation function
# idea at this point is to finetune the t5 model on the data and then create another model that calls the pretrained t5 and then uses the outputs of the pretrained t5 to train additional layers on top of that

class T5AbstractsTrainer:
    def __init__(self, tokenizer, model, device, optimizer):
        self.tokenizer = tokenizer
        self.model = model
        self.device = device
        self.optimizer = optimizer



    def forward(self, ids, mask, y_ids, labels):
      outputs = self.model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
      loss, logits = outputs.loss, outputs.logits

      # return loss, tanh_output
      return loss, logits




    def train(self, epoch, loader):
        self.model.train()

        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(self.device, dtype=torch.long)
            y_ids = y[:, :-1].contiguous()
            labels = y[:, 1:].clone().detach()
            labels[y[:, 1:] == self.tokenizer.pad_token_id] = -100
            ids = data['source_ids'].to(self.device, dtype=torch.long)
            mask = data['source_mask'].to(self.device, dtype=torch.long)

            outputs = self.model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
            loss = outputs[0]

            if _ % 100 == 0:
                print("RUN:", _)
                timestamp = datetime.now(pytz.timezone('America/Los_Angeles')).strftime('%Y-%m-%d %H:%M:%S %Z%z')
                print(f"\t[{timestamp}]")
                print(f'Epoch: {epoch}, Batch Size: {ids.size(0)}, Loss: {loss.item()}')

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()


In [None]:
# removing additional linear layer and tanh activation function
# idea at this point is to finetune the t5 model on the data and then create another model that calls the pretrained t5 and then uses the outputs of the pretrained t5 to train additional layers on top of that

class YuriTheTrainer:
    def __init__(self, tokenizer, t5_model, device, optimizer):
        self.tokenizer = tokenizer
        self.t5_model = t5_model
        self.device = device
        self.optimizer = optimizer
        self.criterion = nn.CrossEntropyLoss()

        # additional layer parameters
        self.MIN_LENGTH = 100
        self.MAX_LENGTH = 300

        # T5 Embedding has shape (32128, 768)
        self.i_haf_to_travel_to_asia = nn.Sequential(nn.Conv1d(32128, 32128, 4, bias=True),
                                                     nn.Dropout(0.1),
                                                     nn.Tanh())



    def forward(self, ids, mask, y_ids, labels):
      outputs = self.t5_model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
      loss, logits = outputs.loss, outputs.logits

      # return loss, tanh_output
      return loss, logits


    def train(self, epoch, loader):
        self.model.train()

        for _, data in enumerate(loader, 0):
          # Get target data from dataloader and send to gpu as a torch datatype
          y = data['target_ids'].to(self.device, dtype=torch.long)

          # reformat target data ids
          y_ids = y[:, :-1].contiguous()

          # define and reformat labels and add missing token id
          labels = y[:, 1:].clone().detach()
          labels[y[:, 1:] == self.tokenizer.pad_token_id] = -100

          # Get source data ids and mask from dataloader and send to gpu as a torch datatype
          ids = data['source_ids'].to(self.device, dtype=torch.long)
          mask = data['source_mask'].to(self.device, dtype=torch.long)

          # Get outputs from model
          # outputs = self.t5_model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
          t5_generate = model.generate(input_ids=ids.expand(1,-1), min_length=self.MIN_LENGTH, max_length=self.MAX_LENGTH)
          loss = outputs[0]

          loss = self.criterion(y_pred, y)

          if _ % 100 == 0:
              print("RUN:", _)
              timestamp = datetime.now(pytz.timezone('America/Los_Angeles')).strftime('%Y-%m-%d %H:%M:%S %Z%z')
              print(f"\t[{timestamp}]")
              print(f'Epoch: {epoch}, Batch Size: {ids.size(0)}, Loss: {loss.item()}')

          self.optimizer.zero_grad()
          loss.backward()
          self.optimizer.step()


## Train T5AbtractsTrainer

In [None]:
# Set random seeds and deterministic pytorch for reproducibility
#torch.manual_seed(42) # pytorch random seed
np.random.seed(42) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained(T5_MODEL)

# tokenization hyperparameter definitions
MAX_LEN = 700
SUMMARY_LEN = 150

# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(df_train, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(df_val, tokenizer, MAX_LEN, SUMMARY_LEN)

# Defining the parameters for creation of dataloaders
BATCHSIZE = 7
train_params = {
    'batch_size': BATCHSIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': BATCHSIZE,
    'shuffle': False,
    'num_workers': 0
    }


# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

# define configuration for peft (source: https://www.philschmid.de/fine-tune-flan-t5-peft, https://huggingface.co/blog/peft)
lora_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
                               inference_mode=False,
                               r=8,
                               lora_alpha=32,
                               lora_dropout=0.1)

# base T5 model from hugging face
t5_model = T5ForConditionalGeneration.from_pretrained(T5_MODEL,output_hidden_states=True,return_dict=True)

# Defining the  PEFT model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = get_peft_model(t5_model, lora_config)
model.to(device)

# take a peek at the peft model for comparison to non-peft
print(f"Preview of trainable parameters in the peft model\n")
model.print_trainable_parameters()

# Defining the optimizer that will be used to tune the weights of the network in the training session.
optimizer = torch.optim.Adam(params = model.parameters(), lr=3e-4) # 1e-4 ended with loss Loss:  2.074270486831665, now trying e-5 -- got 2.3540332317352295 ---> so far best lr seems to be 1e-4. 3e-4 got sub 2 loss, so pretty decent

#summary(model, input_size=[(batch_size, input_length)])
#summary(model, input_size=[(8, 6)])
# print (model)

# train the model for 2 epochs





#for epoch in range(EPOCHS):
#        train(epoch, tokenizer, model, device, training_loader, optimizer)






spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Preview of trainable parameters in the peft model

trainable params: 884,736 || all params: 223,788,288 || trainable%: 0.3953450861557152


In [None]:
# take a peaky poo at the model to verify layer sizes and such
t5trainer = T5AbstractsTrainer(tokenizer, model, device, optimizer)

out = t5_model(input_ids=training_set[0]['source_ids'].expand(1,-1), decoder_input_ids=training_set[0]['source_ids'].expand(1,-1))

In [None]:
gen_out = model.generate(input_ids=training_set[0]['source_ids'].expand(1,-1), min_length=100, max_length=300)



In [None]:
EPOCHS = 1
trainer = T5AbstractsTrainer(tokenizer, model, device, optimizer)
for epoch in range(EPOCHS):
    trainer.train(epoch, training_loader)

# Train YuriTheTrainer with fine tuned T5AbtractsTrainer

In [None]:
EPOCHS = 1
yuri = YuriTheTrainer(tokenizer, model=trainer, device, optimizer)
for epoch in range(EPOCHS):
    yuri.train(epoch, training_loader)

# Straight Outta SF

In [None]:

class Conv1DLayer(nn.Module):
    def __init__(self, input_size, output_size, kernel_size):
        super(Conv1DLayer, self).__init__()
        self.conv1d = nn.Conv1d(input_size, output_size, kernel_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        # x: [batch_size, sequence_length, input_size]
        x = x.permute(0, 2, 1)  # Conv1d expects input in the format: [batch_size, input_size, sequence_length]
        x = self.conv1d(x)
        x = self.relu(x)
        return x

class T5WithConv1D(nn.Module):
    def __init__(self, conv_config):
        super(T5WithConv1D, self).__init__()

        self.t5_model = T5Model.from_config(T5_MODEL)

        self.conv1d_layer = Conv1DLayer(conv_config['input_size'],
                                        conv_config['output_size'],
                                        conv_config['kernel_size'])

    def forward(self, input_ids, attention_mask):
        # T5 forward pass
        t5_outputs = self.t5_model(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   output_hidden_states=True,
                                   return_dict=True)

        last_hidden_state = t5_outputs.last_hidden_state  # shape: [batch_size, sequence_length, hidden_size]

        # Apply Conv1D layer
        conv_output = self.conv1d_layer(last_hidden_state)

        return conv_output

conv_config = {
    'input_size': t5_model_config['d_model'],
    'output_size': t5_model_config['d_model'],
    'kernel_size': 4
}

t5_with_conv1d = T5WithConv1D(conv_config)

# Example input
input_ids = torch.randint(0, 100, (2, 10))  # Batch size of 2, sequence length of 10
attention_mask = torch.ones_like(input_ids)

# Forward pass
output = t5_with_conv1d(input_ids, attention_mask)
print(output.shape)  # Example output shape: torch.Size([2, 128, 8])


In [None]:
from transformers.optimization import AdamW
from tqdm import tqdm

# Instantiate T5 model and Conv1D layer
t5_model_config = T5Config.from_pretrained(T5_MODEL)
conv_config = {'input_size': t5_model_config.d_model, 'output_size': t5_model_config.d_model, 'kernel_size': 4}
t5_with_conv1d = T5WithConv1D(t5_model_config, conv_config)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(t5_with_conv1d.parameters(), lr=3e-5)

# Training loop
epochs = 2
t5_with_conv1d.to(device)
t5_with_conv1d.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(training_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = t5_with_conv1d(input_ids, attention_mask)
        logits = outputs.view(-1, outputs.size(-1))

        loss = criterion(logits, labels.view(-1))
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(training_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}')

# Save the trained model
# torch.save(t5_with_conv1d.state_dict(), 't5_with_conv1d_model.pth')
