File size: 6,768 Bytes
c00d132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e6f
c00d132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da0e5e8
 
 
 
 
 
 
 
 
 
 
c00d132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e6f
 
c00d132
 
12d3e6f
 
c00d132
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import json,math,datetime
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from torch.optim import AdamW

class QuizletDataset(Dataset):
    """Prompt/answer pairs loaded from a JSON file.

    The file must contain a list of records, each carrying a 'prompt'
    key and a 'messages' list whose second entry holds the answer text
    under 'content'.
    """

    def __init__(self, json_file):
        # Read the whole JSON payload into memory up front.
        with open(json_file, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Samplers may hand us a tensor index; normalize it first.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        record = self.data[idx]
        # NOTE(review): assumes the answer lives at messages[1]['content'] —
        # confirm against the data-generation script.
        return {
            'prompt': record['prompt'],
            'answer': record['messages'][1]['content'],
        }

def evaluate_model(model, val_dataloader, device):
    """Compute average cross-entropy loss and perplexity on a dataloader.

    Uses the module-level `tokenizer`. Returns (avg_loss, perplexity).

    NOTE(review): only the 'prompt' field is scored here, while training
    scores prompt + answer — confirm this asymmetry is intended.
    """
    model.eval()
    total_eval_loss = 0.0
    for batch in val_dataloader:
        inputs = tokenizer(batch['prompt'], return_tensors="pt",
                           padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # BUG FIX: HF causal-LM models shift labels internally, so the
        # previous manual left-shift made the loss compare each token
        # against the token TWO positions ahead. Pass the unshifted ids
        # and mask padding with -100 so pads are ignored by the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()

    avg_eval_loss = total_eval_loss / len(val_dataloader)
    perplexity = math.exp(avg_eval_loss)
    return avg_eval_loss, perplexity

def evaluate_training(model, train_loader, device):
    """Compute the average loss over `train_loader` with the model in eval mode.

    Uses the module-level `tokenizer`. Returns the mean per-batch loss.

    NOTE(review): only the 'prompt' field is scored here, while the
    training loop scores prompt + answer — confirm this is intended.
    """
    model.eval()
    total_train_loss = 0.0

    with torch.no_grad():
        for batch in train_loader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt",
                               padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            # FIX: exclude padding tokens from the loss (previously pads
            # were scored as real targets). -100 is the HF ignore index.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_train_loss += outputs.loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    return avg_train_loss


# ---------------------------------------------------------------------------
# Script entry: load data, freeze most of TinyLlama, fine-tune the last
# layers on prompt+answer pairs, and log loss/perplexity per epoch.
# ---------------------------------------------------------------------------

# Assuming the JSON file is in the same directory as the script.
full_dataset = QuizletDataset(json_file='training_data_output.json')

# 80/20 train/test split.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

print("Loading data into PyTorch Tensors...")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
model = AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
#print(model) # view the layers of the model to be frozen

# Freeze all layers, then selectively unfreeze what we fine-tune.
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last n transformer layers.
for layer in model.model.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True

# # Unfreeze the embedding layer: only want to do if you are adding new tokens to the model
# for param in model.model.embed_tokens.parameters():
#     param.requires_grad = True

# Unfreeze the output (LM head) layer.
for param in model.lm_head.parameters():
    param.requires_grad = True

# Optimizer and linear-decay schedule over all training steps.
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Record the pre-trained model's baseline performance for comparison.
initial_loss, initial_perplexity = evaluate_model(model, test_loader, device)
performance_log = {
    "pretrained": {
        "loss": initial_loss,
        "perplexity": initial_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    },
    "finetuned": []
}

print("Starting Training...")
for epoch in range(epochs):
    model.train()

    for batch in train_loader:
        optimizer.zero_grad()

        # Concatenate 'prompt' and 'answer' with the EOS token in between.
        combined = [p + tokenizer.eos_token + a for p, a in zip(batch['prompt'], batch['answer'])]

        inputs = tokenizer(combined, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # BUG FIX: HF causal-LM models shift labels internally; the old
        # manual left-shift (labels = input_ids[:, 1:]) therefore trained
        # each position against the token TWO steps ahead. Pass unshifted
        # ids and mask padding with -100 (the HF ignore index) instead.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        if loss is not None:
            loss.backward()
            optimizer.step()
            scheduler.step()
        else:
            print(f"No loss to backpropagate for batch {batch}")

    # Evaluate after each epoch and compare with the pre-trained baseline.
    train_loss = evaluate_training(model, train_loader, device)
    finetuned_loss, finetuned_perplexity = evaluate_model(model, test_loader, device)
    epoch_performance = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": finetuned_loss,
        "perplexity": finetuned_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    }
    performance_log["finetuned"].append(epoch_performance)

    # Optionally, save the model checkpoint
    # model.save_pretrained(f"model_checkpoint_epoch_{epoch}.bin")
    print(f"Epoch {epoch + 1} / {epochs}. Performance: {epoch_performance}")


# Persist the performance log, timestamped so runs never overwrite each other.
print("Saving performance log...")
training_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
with open(f"performance_log_{training_datetime}.json", "w") as file:
    json.dump(performance_log, file, indent=4)

# Save the fine-tuned weights and the tokenizer together so the directory
# can be reloaded with from_pretrained().
model.save_pretrained("trained_models/")
tokenizer.save_pretrained("trained_models/")
print("Done!")