import torch
import sys
import argparse
import os

sys.path.append("..")
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from numpy.random import default_rng

os.environ["TOKENIZERS_PARALLELISM"] = "false"

MODEL_NAME = "Llama-3.2-3B"                         # checkpoint directory name under ../train/checkpoints
ori_model_name = "meta-llama/Llama-3.2-3B"          # pretrained base model on the Hugging Face Hub
MODEL_NAME_SAVE = "Llama-3.2-3B-500-Remove-layers"  # tag used for the results directory

FILE_SAMPLE_SIZE = 500  # number of test examples sampled for evaluation


def remove_layers(original_model, exp_model, i, j):
    """Replace layers i .. i+j-1 of `exp_model` with the corresponding
    pretrained layers from `original_model`.

    Despite the name, nothing is deleted: the experiment "removes" the
    fine-tuned layers by restoring their original pretrained weights.
    """
    original_layers = original_model.model.layers
    exp_layers = exp_model.model.layers

    print("len(layers):", len(original_layers))

    if i < 0 or j < 0 or (i + j) > len(exp_layers):
        raise ValueError(f"Invalid layer indices: i={i}, i+j={i+j}. Must satisfy 0 <= i and i+j <= {len(exp_layers)}.")

    for idx in range(i, i + j):
        print(f"Replacing parameters of LlamaDecoderLayer {idx}...")
        exp_layers[idx].load_state_dict(original_layers[idx].state_dict())

    return exp_model
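
# Illustrative sanity check (not part of the original script): after the copy,
# a replaced layer's weights should match the pretrained ones exactly, e.g.
#
#   torch.equal(
#       next(exp_model.model.layers[i].parameters()),
#       next(original_model.model.layers[i].parameters()),
#   )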


def get_perplexities(model, tokenizer, eval_dataset, batch_size):
    """Evaluate `model` on `eval_dataset` and return the corpus perplexity,
    i.e. exp(mean cross-entropy loss) as reported by the Trainer.

    The tokenizer is passed explicitly rather than read from module scope.
    """
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir="./tmp_trainer",
        per_device_eval_batch_size=batch_size,
        fp16=True,
        report_to="none",
    )

    trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset, data_collator=data_collator)
    eval_results = trainer.evaluate()
    print("eval_results:", eval_results)
    loss = eval_results["eval_loss"]
    perplexity = torch.exp(torch.tensor(loss)).item()
    return perplexity
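
# Note: DataCollatorForLanguageModeling with mlm=False copies input_ids into
# labels and sets padded positions to -100, so the max_length padding applied
# during tokenization does not inflate the evaluation loss or the perplexity.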


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Calculate perplexity on the test dataset.")

    parser.add_argument('perturbation',
                        type=str,
                        default='reverse_full',
                        nargs='?',
                        help='Type of perturbation to use.')
    parser.add_argument('train_set',
                        type=str,
                        default='test',
                        nargs='?',
                        help='Dataset size for training.')
    parser.add_argument('checkpoint_path',
                        type=str,
                        default='checkpoint-100',
                        nargs='?',
                        help='Checkpoint directory to evaluate.')
    parser.add_argument('batch_size',
                        type=int,
                        default=4,
                        nargs='?',
                        help='Batch size for evaluation.')
    parser.add_argument('seed',
                        type=int,
                        default=0,
                        nargs='?',
                        help='Random seed.')
    parser.add_argument('remove_layer',
                        type=int,
                        default=1,
                        nargs='?',
                        help='Index of the first layer to replace with pretrained weights.')
    args = parser.parse_args()

    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('../train/babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    test_dataset = dataset['test']
    print(test_dataset)

    checkpoint_path = f'../train/checkpoints/{MODEL_NAME}/babylm_{args.perturbation}_10M_seed0/runs/{args.checkpoint_path}'

    # Sample a fixed subset of the test set for a cheaper, reproducible evaluation.
    rng = default_rng(args.seed)
    indices = rng.choice(len(test_dataset), FILE_SAMPLE_SIZE, replace=False)
    sampled_test_dataset = test_dataset.select(indices)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
    model_ori = AutoModelForCausalLM.from_pretrained(ori_model_name)
    # Restore 9 consecutive layers, starting at args.remove_layer, to their
    # pretrained weights before evaluating.
    model = remove_layers(model_ori, model, args.remove_layer, 9)
    print(model)
    model.eval()
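
    # The base Llama tokenizer ships without a pad token, which makes the
    # padding="max_length" call below fail. Reusing EOS as PAD is a common
    # workaround; this guard is an addition, not part of the original script.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token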

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

    tokenized_test = sampled_test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    perplexity = get_perplexities(model, tokenizer, tokenized_test, args.batch_size)

    output_directory = f"perplexities_results/{MODEL_NAME_SAVE}"
    os.makedirs(output_directory, exist_ok=True)

    output_file = os.path.join(output_directory, f"{args.perturbation}_{args.batch_size}_{args.seed}.csv")

    if not os.path.exists(output_file):
        with open(output_file, 'w') as f:
            print("Writing header to CSV...")
            f.write("checkpoint_path, perplexity\n")

    with open(output_file, 'a') as f:
        print("Appending result to CSV...")
        f.write(f"{args.checkpoint_path}-{args.remove_layer + 1}, {perplexity}\n")
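
# Example invocation (all six positionals are optional and fall back to the
# parser defaults; the script filename below is a placeholder):
#
#   python eval_remove_layers.py reverse_full test checkpoint-100 4 0 1
#
# Each run appends one row to
# perplexities_results/Llama-3.2-3B-500-Remove-layers/<perturbation>_<batch_size>_<seed>.csv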