import torch
import sys
import argparse
import os

sys.path.append("..")

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from numpy.random import default_rng

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# MODEL_NAME = "Llama-3.2-3B"
# ori_model_name = "meta-llama/Llama-3.2-3B"
# MODEL_NAME_SAVE = "Llama-3.2-3B-500-Remove"
MODEL_NAME = "GPT-2"
ori_model_name = "gpt2"
MODEL_NAME_SAVE = "GPT2-500-Remove-layers"

FILE_SAMPLE_SIZE = 500


def remove_layers(original_model, exp_model, i, j):
    """Overwrite transformer blocks i..i+j-1 of exp_model with the
    corresponding pretrained blocks of original_model, i.e. undo the
    fine-tuning of those layers rather than deleting them."""
    original_blocks = original_model.transformer.h
    exp_blocks = exp_model.transformer.h
    print("len(original_blocks):", len(original_blocks))
    print("len(exp_blocks):", len(exp_blocks))

    # Ensure the indices are valid
    if i < 0 or j < 0 or (i + j) > len(exp_blocks):
        raise ValueError(
            f"Invalid block indices: i={i}, i+j={i + j}. "
            f"Must satisfy 0 <= i and i+j <= {len(exp_blocks)}."
        )

    # Replace the parameters of blocks i to i+j-1
    for idx in range(i, i + j):
        print(f"Replacing parameters of Transformer block {idx}...")
        exp_blocks[idx].load_state_dict(original_blocks[idx].state_dict())

    return exp_model


def get_perplexities(model, eval_dataset, tokenizer, batch_size):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    training_args = TrainingArguments(
        output_dir="./tmp_trainer",
        per_device_eval_batch_size=batch_size,
        fp16=torch.cuda.is_available(),  # fp16 is only supported on GPU
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    eval_results = trainer.evaluate()
    print("eval_results:", eval_results)

    # Perplexity is the exponential of the mean cross-entropy loss.
    loss = eval_results["eval_loss"]
    perplexity = torch.exp(torch.tensor(loss)).item()
    return perplexity


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Calculate perplexity on the test dataset after restoring "
                    "selected transformer blocks to their pretrained weights."
    )
    parser.add_argument('perturbation', type=str, default='reverse_full', nargs='?',
                        help='Type of perturbation to use.')
    parser.add_argument('train_set', type=str, default='test', nargs='?',
                        help='Dataset size for training.')
    parser.add_argument('checkpoint_path', type=str, default='checkpoint-100', nargs='?',
                        help='Name of the checkpoint directory to evaluate.')
    parser.add_argument('batch_size', type=int, default=4, nargs='?',
                        help='Batch size for evaluation.')
    parser.add_argument('seed', type=int, default=0, nargs='?',
                        help='Random seed.')
    parser.add_argument('remove_layer', type=int, default=1, nargs='?',
                        help='Index of the first transformer block to restore '
                             'to pretrained weights.')
    args = parser.parse_args()

    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('../train/babylm_dataset_test.py', name=dataset_name,
                           trust_remote_code=True)
    test_dataset = dataset['test']  # Load test dataset
    print(test_dataset)

    checkpoint_path = (
        f'../train/checkpoints/{MODEL_NAME}/'
        f'babylm_{args.perturbation}_10M_seed0/runs/{args.checkpoint_path}'
    )

    # Subsample the test set for faster evaluation.
    rng = default_rng(args.seed)
    indices = rng.choice(len(test_dataset), FILE_SAMPLE_SIZE, replace=False)
    sampled_test_dataset = test_dataset.select(indices)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    # GPT-2 defines no pad token by default; fall back to EOS so padding works
    # (a no-op if the checkpoint's tokenizer already has one).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
    model_ori = AutoModelForCausalLM.from_pretrained(ori_model_name)

    # Restore 4 consecutive blocks, starting at args.remove_layer, to their
    # pretrained weights.
    model = remove_layers(model_ori, model, args.remove_layer, 4)
    print(model)
    model.eval()
    # if torch.cuda.is_available():
    #     model.to('cuda')  # Not needed: Trainer handles device placement.

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True,
                         max_length=1024)

    tokenized_test = sampled_test_dataset.map(tokenize_function, batched=True,
                                              remove_columns=["text"])
    perplexity = get_perplexities(model, tokenized_test, tokenizer, args.batch_size)

    # Save the result to the specified directory
    output_directory = f"perplexities_results/{MODEL_NAME_SAVE}"
    os.makedirs(output_directory, exist_ok=True)

    # Construct the output file path
    output_file = os.path.join(output_directory,
                               f"{args.perturbation}_{args.batch_size}_{args.seed}.csv")

    # Write the header to the CSV file if it doesn't exist
    if not os.path.exists(output_file):
        with open(output_file, 'w') as f:
            print("Writing header to CSV...")
            f.write("checkpoint_path, perplexity\n")

    # Append the perplexity result to the CSV file
    with open(output_file, 'a') as f:  # Open in append mode
        print("Appending result to CSV...")
        f.write(f"{args.checkpoint_path}-{args.remove_layer + 1}, {perplexity}\n")
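
# Example invocation (the script name below is hypothetical; paths and
# arguments depend on your checkout):
#   python remove_layers_perplexity.py reverse_full test checkpoint-100 4 0 1
# This restores blocks 1-4 of the checkpointed GPT-2 to their pretrained gpt2
# weights and appends the resulting perplexity to
# perplexities_results/GPT2-500-Remove-layers/reverse_full_4_0.csv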