# ScalingMakeItPossible / perplexities/perplexities_llama.py
# Uploaded by Yaning1001 via the upload-large-folder tool (commit 69168b6, verified).
import torch
import sys
import argparse
import os
sys.path.append("..")
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from numpy.random import default_rng
# Disable tokenizer thread parallelism to avoid fork-related warnings/deadlocks
# when the HF Trainer spawns dataloader workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# MODEL_NAME = "Llama-3.2-1B"
# MODEL_NAME = "Llama-3.2-1B-500"
MODEL_NAME = "GPT-2"  # checkpoint directory name under ../train/checkpoints/
MODEL_NAME_SAVE = "GPT2-500"  # results subdirectory name under perplexities_results/
FILE_SAMPLE_SIZE = 500  # number of test examples sampled for evaluation
def get_perplexities(model, eval_dataset, batch_size, tokenizer=None):
    """Compute the perplexity of a causal LM over a tokenized eval dataset.

    Runs one HF ``Trainer.evaluate()`` pass and exponentiates the mean
    cross-entropy loss.

    Args:
        model: causal language model to evaluate.
        eval_dataset: tokenized dataset (``input_ids`` etc., no raw text column).
        batch_size: per-device evaluation batch size.
        tokenizer: tokenizer for the LM data collator. Defaults to the
            module-level ``tokenizer`` global for backward compatibility with
            the original implicit-global behavior.

    Returns:
        float: ``exp(eval_loss)`` over the dataset.
    """
    if tokenizer is None:
        # Preserve legacy behavior: fall back to the script-level global.
        tokenizer = globals()["tokenizer"]
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    training_args = TrainingArguments(
        output_dir="./tmp_trainer",
        per_device_eval_batch_size=batch_size,
        # fp16 is only supported on GPU; TrainingArguments raises on CPU-only
        # machines if fp16=True, and the script explicitly supports CPU runs.
        fp16=torch.cuda.is_available(),
        report_to="none"
    )
    trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset, data_collator=data_collator)
    eval_results = trainer.evaluate()
    print("eval_results:", eval_results)
    loss = eval_results['eval_loss']
    # Perplexity is exp of the average token-level cross-entropy loss.
    perplexity = torch.exp(torch.tensor(loss)).item()
    return perplexity
if __name__ == "__main__":
    # ---- CLI -----------------------------------------------------------
    # All arguments are optional positionals (nargs='?') with defaults, so
    # the script can be run with any prefix of them.
    parser = argparse.ArgumentParser(description="Calculate perplexity on test dataset.")
    arg_specs = [
        ('perturbation', str, 'reverse_full', 'Type of perturbation to use.'),
        ('train_set', str, 'test', 'Dataset size for training.'),
        ('checkpoint_path', str, 'checkpoint-100', 'Dataset size for training.'),
        ('batch_size', int, 4, 'Batch size for evaluation.'),
        ('seed', int, 0, 'Random seed.'),
    ]
    for arg_name, arg_type, arg_default, arg_help in arg_specs:
        parser.add_argument(arg_name, type=arg_type, default=arg_default, nargs='?', help=arg_help)
    args = parser.parse_args()

    # ---- Data ----------------------------------------------------------
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('../train/babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    test_dataset = dataset['test']
    print(test_dataset)

    # NOTE: checkpoint dir is pinned to the 10M/seed0 training run regardless
    # of the CLI seed (seed only controls dataset choice and sampling).
    checkpoint_path = f'../train/checkpoints/{MODEL_NAME}/babylm_{args.perturbation}_10M_seed0/runs/{args.checkpoint_path}'

    # Deterministically sub-sample the test split without replacement.
    sampled_test_dataset = test_dataset.select(
        default_rng(args.seed).choice(len(test_dataset), FILE_SAMPLE_SIZE, replace=False)
    )

    # ---- Model ---------------------------------------------------------
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path).eval()
    if torch.cuda.is_available():
        model.to('cuda')

    def tokenize_function(examples):
        # Fixed-length padding so every batch shares one shape.
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

    tokenized_test = sampled_test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    perplexity = get_perplexities(model, tokenized_test, args.batch_size)

    # ---- Persist result ------------------------------------------------
    output_directory = f"perplexities_results/{MODEL_NAME_SAVE}"
    os.makedirs(output_directory, exist_ok=True)
    output_file = os.path.join(output_directory, f"{args.perturbation}_{args.batch_size}_{args.seed}.csv")

    # Write the CSV header exactly once, then always append one result row.
    if not os.path.exists(output_file):
        with open(output_file, 'w') as f:
            print("Writing header to CSV...")
            f.write("checkpoint_path, perplexity\n")
    with open(output_file, 'a') as f:
        print("Appending result to CSV...")
        f.write(f"{args.checkpoint_path}, {perplexity}\n")