import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup

import nltk
nltk.download('punkt')

import sys

import pytz
IST = pytz.timezone('Asia/Kolkata')
stamp = datetime.datetime.now(IST).strftime("%c")

print('\n')
print('='*100)
print('='*100)
print('\t\t=Experiment6=', stamp)
print('='*100)
print('='*100)

out_path = '/media/data_dump/Ritwik/ggpt/'
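
# This script runs in two stages: stage 1 continues language-model training of GPT-2 small
# on the AI4Bharat English corpus, stage 2 continues on all_tc_sents_768.txt. Both stages
# checkpoint to out_path and can resume from a saved step/epoch; a generation demo runs at the end.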
|
|
|
|
hyper_params = {'rseed': 123}

import transformers
import psutil
os.environ['PYTHONHASHSEED'] = str(hyper_params['rseed'])

torch.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed_all(hyper_params['rseed'])

np.random.seed(hyper_params['rseed'])
random.seed(hyper_params['rseed'])
transformers.set_seed(hyper_params['rseed'])

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
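
# <|startoftext|>, <|endoftext|> and <|pad|> are added on top of the stock GPT-2 vocabulary,
# which is why model.resize_token_embeddings(len(tokenizer)) is called once the model is built.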
|
|
sfile = '/media/nas_mount/Ritwik/Ai4Bharat_text_corpora/data/en/en_clean.txt'
print(sfile)
file = open(sfile, 'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])

print('Dataframe created')
df.dropna(inplace=True)
bios = df.bio_main.copy()
print(datetime.datetime.now(IST).strftime("%c"))
|
|
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
print(datetime.datetime.now(IST).strftime("%c"))
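
# Stage 1 uses a lazy Dataset: each corpus line is tokenized on demand in __getitem__,
# wrapped in <|startoftext|>/<|endoftext|>, and padded or truncated to max_length.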
|
|
batch_size = 8


class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sents = list(txt_list)

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        txt = self.sents[idx]
        encodings_dict = self.tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True, max_length=self.max_length, padding="max_length")
        input_ids = torch.tensor(encodings_dict['input_ids'])
        attn_masks = torch.tensor(encodings_dict['attention_mask'])
        return input_ids, attn_masks


dataset = GPT2Dataset(bios, tokenizer, max_length=500)
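
# Optional sanity check: each item is a pair of 1-D tensors of length max_length (500 here).
# ids, mask = dataset[0]
# print(ids.shape, mask.shape)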
|
|
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)
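
# RandomSampler shuffles the training batches each epoch; SequentialSampler keeps the
# validation order fixed.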
|
|
|
|
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda")

model = model.to(device)

print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))
|
|
epochs = 1
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

sample_every = 1000

optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=epsilon
                  )

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)
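
# The schedule warms the learning rate up linearly for the first warmup_steps optimizer
# steps, then decays it linearly to zero over total_steps.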
|
|
|
|
|
|
|
|
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))


output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()

training_stats = []

last_epoch, last_step = -1, -1
try:
    file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except Exception:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])

    checkpoint = torch.load(out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
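
# The state file stores 'step:<n>|epoch:<m>'. When it exists alongside the .pth.tar
# checkpoints, training resumes below by skipping epochs before last_epoch and steps
# up to last_step.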
|
|
|
|
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    if last_epoch != -1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None
                        )
|
|
        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

            try:
                torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'w')
                file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
                file.close()
            except Exception:
                torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'w')
                file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
                file.close()

        loss.backward()

        optimizer.step()

        scheduler.step()
|
|
    last_epoch, last_step = -1, -1

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))
|
|
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print(" Validation Loss: {0:.2f}".format(avg_val_loss))
    print(" Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))
|
|
try:
    pd.set_option('display.precision', 2)

    df_stats = pd.DataFrame(data=training_stats)

    df_stats = df_stats.set_index('epoch')

    print(df_stats)

    sns.set(style='darkgrid')

    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12, 6)

    plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
    plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

    plt.title("Training & Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.xticks([1, 2, 3, 4])

    plt.savefig(out_path + "training.png")

    params = list(model.named_parameters())

    print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')

    for p in params[0:2]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')

    for p in params[2:14]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')

    for p in params[-2:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print("Saving model to %s" % output_dir)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

except Exception as e:
    print(e)
    print('Waiting for 10 seconds')
    time.sleep(10)
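
# Stage 2: continue training the same in-memory model on all_tc_sents_768.txt, this time
# with a pre-tokenized dataset, max_length 768, batch_size 4 and 3 epochs.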
|
|
sfile = 'all_tc_sents_768.txt'
print(sfile)
file = open(sfile, 'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])

print('Dataframe created')
df.dropna(inplace=True)
bios = df.bio_main.copy()

doc_lengths = []
for bio in bios:
    tokens = nltk.word_tokenize(bio)
    doc_lengths.append(len(tokens))
doc_lengths = np.array(doc_lengths)
a = sns.distplot(doc_lengths)
a.get_figure().savefig(out_path + "out.png")
print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)', len(doc_lengths[doc_lengths > 768]) / len(doc_lengths))
print('np.average(doc_lengths)', np.average(doc_lengths))
print(datetime.datetime.now(IST).strftime("%c"))
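
# The fraction of documents longer than 768 word tokens is printed above; stage 2
# truncates/pads every example to max_length=768.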
|
|
|
|
| print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length)) |
| print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id)) |
| print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id)) |
| print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id)) |
| print(datetime.datetime.now(IST).strftime("%c")) |
|
|
| batch_size = 4 |
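
# Unlike stage 1, this Dataset tokenizes the whole corpus up front and keeps the resulting
# input_ids/attention_mask tensors in memory.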
|
|
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:

            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


dataset = GPT2Dataset(bios, tokenizer, max_length=768)
|
|
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)
|
|
'''
# I'm not really doing anything with the config here
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")

model = model.to(device)
'''

print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))
|
|
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

sample_every = 1000

optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=epsilon
                  )

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)
|
|
|
|
|
|
|
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))


output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()

training_stats = []

last_epoch, last_step = -1, -1
try:
    file = open(out_path + 'model_save/checkpoint_state.txt', 'r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except Exception:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])

    checkpoint = torch.load(out_path + 'model_save/model_checkpoint.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path + 'model_save/tokenizer_checkpoint.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
|
|
|
|
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    if last_epoch != -1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None
                        )
|
|
        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

            torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint.pth.tar')
            torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint.pth.tar')
            file = open(out_path + 'model_save/checkpoint_state.txt', 'w')
            file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
            file.close()

        loss.backward()

        optimizer.step()

        scheduler.step()
|
|
    last_epoch, last_step = -1, -1

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))
|
|
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print(" Validation Loss: {0:.2f}".format(avg_val_loss))
    print(" Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))
|
|
pd.set_option('display.precision', 2)

df_stats = pd.DataFrame(data=training_stats)

df_stats = df_stats.set_index('epoch')

print(df_stats)

sns.set(style='darkgrid')

sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.savefig(out_path + "training.png")

params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print("Saving model to %s" % output_dir)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
|
|
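# Assumption: the model and tokenizer saved just above are re-loaded here before generation.
# The original reload code was elided; a minimal sketch (not the author's exact step):
#   model = GPT2LMHeadModel.from_pretrained(output_dir).to(device)
#   tokenizer = GPT2Tokenizer.from_pretrained(output_dir)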
|
|
|
print('Model and tokenizer loaded!')
print(datetime.datetime.now(IST).strftime("%c"))

model.eval()

prompt = "<|startoftext|> I wish to say that"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    max_length=500,
    top_p=0.95,
    num_return_sequences=3
)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

print(datetime.datetime.now(IST).strftime("%c"))
|
|