import torch
import sys
import argparse
import os
sys.path.append("..")
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from numpy.random import default_rng
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Alternative model configuration (Llama); swap with the GPT-2 block below to switch models.
# MODEL_NAME = "Llama-3.2-3B"
# ori_model_name = "meta-llama/Llama-3.2-3B"
# MODEL_NAME_SAVE = "Llama-3.2-3B-500-Remove"
# Directory name of the fine-tuned checkpoints under ../train/checkpoints/.
MODEL_NAME = "GPT-2"
# Hugging Face hub id of the pretrained model whose layers are copied into the checkpoint.
ori_model_name = "gpt2"
# Sub-directory under perplexities_results/ where result CSVs are written.
MODEL_NAME_SAVE = "GPT2-500-Remove-layers"
# Number of test examples sampled (without replacement) for evaluation.
FILE_SAMPLE_SIZE = 500
def remove_layers(original_model, exp_model, i, j):
    """Overwrite transformer blocks [i, i+j) of *exp_model* with the
    corresponding blocks of *original_model*.

    Despite the name, no blocks are removed: the experiment model's
    fine-tuned parameters in that range are replaced by the original
    (pretrained) parameters via ``load_state_dict``.

    Args:
        original_model: Source model; must expose ``.transformer.h``
            (GPT-2-style block list).
        exp_model: Target model, modified in place; same architecture.
        i: Index of the first block to replace (0-based).
        j: Number of consecutive blocks to replace.

    Returns:
        The (mutated) ``exp_model``.

    Raises:
        ValueError: If ``i`` or ``j`` is negative, or ``i + j`` exceeds
            the block count of either model.
    """
    original_blocks = original_model.transformer.h
    exp_blocks = exp_model.transformer.h
    print("len(original_blocks):", len(original_blocks))
    print("len(exp_blocks):", len(exp_blocks))
    # Validate against BOTH stacks: the original code only checked
    # exp_blocks, so a too-short original_model would later raise a bare
    # IndexError on original_blocks[idx] instead of a clear ValueError.
    limit = min(len(original_blocks), len(exp_blocks))
    if i < 0 or j < 0 or (i + j) > limit:
        raise ValueError(f"Invalid block indices: i={i}, i+j={i+j}. Must satisfy 0 <= i and i+j <= {limit}.")
    # Copy the parameters of blocks i .. i+j-1 from original to experiment.
    for idx in range(i, i + j):
        print(f"Replacing parameters of Transformer block {idx}...")
        original_block = original_blocks[idx]
        exp_blocks[idx].load_state_dict(original_block.state_dict())
    return exp_model
def get_perplexities(model, eval_dataset, batch_size, eval_tokenizer=None):
    """Evaluate *model* on *eval_dataset* and return its perplexity.

    Uses a causal-LM ``Trainer`` evaluation pass and exponentiates the
    resulting cross-entropy loss.

    Args:
        model: Causal language model to evaluate.
        eval_dataset: Tokenized dataset (must contain ``input_ids`` etc.).
        batch_size: Per-device evaluation batch size.
        eval_tokenizer: Tokenizer for the data collator. Defaults to the
            module-level ``tokenizer`` for backward compatibility; the
            original code read that global implicitly.

    Returns:
        float: ``exp(eval_loss)`` of the evaluation run.
    """
    if eval_tokenizer is None:
        # Backward-compatible fallback to the global set in __main__.
        eval_tokenizer = tokenizer
    data_collator = DataCollatorForLanguageModeling(tokenizer=eval_tokenizer, mlm=False)
    training_args = TrainingArguments(
        output_dir="./tmp_trainer",
        per_device_eval_batch_size=batch_size,
        # BUGFIX: fp16=True unconditionally crashes transformers on
        # CPU-only runs (the script never calls model.to('cuda')).
        fp16=torch.cuda.is_available(),
        report_to="none"
    )
    trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset, data_collator=data_collator)
    eval_results = trainer.evaluate()
    print("eval_results:", eval_results)
    loss = eval_results['eval_loss']
    # Perplexity is exp of the mean cross-entropy loss.
    perplexity = torch.exp(torch.tensor(loss)).item()
    return perplexity
if __name__ == "__main__":
    # All arguments are positional-with-defaults (nargs='?'), so the script
    # can be invoked with any prefix of them.
    parser = argparse.ArgumentParser(description="Calculate perplexity on test dataset.")
    parser.add_argument('perturbation',
                        type=str,
                        default='reverse_full',
                        nargs='?',
                        help='Type of perturbation to use.')
    parser.add_argument('train_set',
                        type=str,
                        default='test',
                        nargs='?',
                        help='Dataset size for training.')
    parser.add_argument('checkpoint_path',
                        type=str,
                        default='checkpoint-100',
                        nargs='?',
                        # BUGFIX: help text was copy-pasted from train_set
                        # ("Dataset size for training.") and described the
                        # wrong argument.
                        help='Name of the checkpoint directory to evaluate.')
    parser.add_argument('batch_size',
                        type=int,
                        default=4,
                        nargs='?',
                        help='Batch size for evaluation.')
    parser.add_argument('seed',
                        type=int,
                        default=0,
                        nargs='?',
                        help='Random seed.')
    parser.add_argument('remove_layer',
                        type=int,
                        default=1,
                        nargs='?',
                        help='Layer index to remove')
    args = parser.parse_args()
    # Load the BabyLM test split via the project-local dataset script.
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('../train/babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    test_dataset = dataset['test']  # Load test dataset
    print(test_dataset)
    # NOTE(review): the checkpoint path is pinned to the 10M/seed0 training
    # run regardless of args.train_set / args.seed — confirm this is intended.
    checkpoint_path = f'../train/checkpoints/{MODEL_NAME}/babylm_{args.perturbation}_10M_seed0/runs/{args.checkpoint_path}'
    # Deterministically subsample FILE_SAMPLE_SIZE test examples.
    rng = default_rng(args.seed)
    indices = rng.choice(len(test_dataset), FILE_SAMPLE_SIZE, replace=False)
    sampled_test_dataset = test_dataset.select(indices)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
    model_ori = AutoModelForCausalLM.from_pretrained(ori_model_name)
    # Restore 4 consecutive pretrained blocks starting at args.remove_layer.
    model = remove_layers(model_ori, model, args.remove_layer, 4)
    print(model)
    model.eval()
    # if torch.cuda.is_available():
    #     model.to('cuda')
    def tokenize_function(examples):
        # Fixed-length padding so the LM collator can batch without resizing.
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
    tokenized_test = sampled_test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    perplexity = get_perplexities(model, tokenized_test, args.batch_size)
    # Save the result to the specified directory
    output_directory = f"perplexities_results/{MODEL_NAME_SAVE}"
    os.makedirs(output_directory, exist_ok=True)
    # Construct the output file path
    output_file = os.path.join(output_directory, f"{args.perturbation}_{args.batch_size}_{args.seed}.csv")
    # Write the header to the CSV file if it doesn't exist
    if not os.path.exists(output_file):
        with open(output_file, 'w') as f:
            print("Writing header to CSV...")
            # (was a pointless f-string with no placeholders)
            f.write("checkpoint_path, perplexity\n")
    # Append the perplexity result to the CSV file
    with open(output_file, 'a') as f:  # Open in append mode
        print("Appending result to CSV...")
        f.write(f"{args.checkpoint_path}-{args.remove_layer+1}, {perplexity}\n")