import torch
import sys
import argparse
import os
sys.path.append("..")
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from numpy.random import default_rng
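# Disable tokenizer parallelism to avoid fork-related warnings from the
# Trainer's DataLoader workers.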
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# MODEL_NAME = "Llama-3.2-3B"
# ori_model_name = "meta-llama/Llama-3.2-3B"
# MODEL_NAME_SAVE = "Llama-3.2-3B-500-Remove"

MODEL_NAME = "GPT-2"
ori_model_name = "gpt2"
MODEL_NAME_SAVE = "GPT2-500-Remove-layers"

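# Number of test documents sampled for the perplexity estimate.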
FILE_SAMPLE_SIZE = 500


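# NOTE: despite the name, this does not delete blocks. It copies the
# pretrained weights of blocks [i, i+j) from original_model into exp_model,
# effectively undoing training in those layers.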
def remove_layers(original_model, exp_model, i, j):

    original_blocks = original_model.transformer.h
    exp_blocks = exp_model.transformer.h

    print("len(original_blocks):", len(original_blocks))
    print("len(exp_blocks):", len(exp_blocks))

    # Ensure the indices are valid
    if i < 0 or (i + j) > len(exp_blocks) or j < 0:
        raise ValueError(f"Invalid block indices: i={i}, i+j={i+j}. Must satisfy 0 <= i and i+j <= {len(exp_blocks)}.")

    # Replace the parameters of the blocks from i to i+j
    for idx in range(i, i + j):
        print(f"Replacing parameters of Transformer block {idx}...")
        original_block = original_blocks[idx]
        exp_blocks[idx].load_state_dict(original_block.state_dict())

    return exp_model



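# Runs evaluation through a throwaway Trainer and converts the mean
# cross-entropy loss into perplexity via exp(loss).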
def get_perplexities(model, tokenizer, eval_dataset, batch_size):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir="./tmp_trainer",
        per_device_eval_batch_size=batch_size,
        fp16=torch.cuda.is_available(),  # fp16 requires a GPU; fall back to fp32 on CPU
        report_to="none"
    )

    trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset, data_collator=data_collator)
    eval_results = trainer.evaluate()
    print("eval_results:", eval_results)  
    loss = eval_results['eval_loss']
    perplexity = torch.exp(torch.tensor(loss)).item()
    return perplexity

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Calculate perplexity on test dataset.")

    parser.add_argument('perturbation', 
                        type=str, 
                        default='reverse_full', 
                        nargs='?', 
                        help='Type of perturbation to use.')
    parser.add_argument('train_set',
                        type=str,
                        default='test',
                        nargs='?',
                        help='Training-set size identifier used in the dataset name (e.g., 10M).')
    parser.add_argument('checkpoint_path',
                        type=str,
                        default='checkpoint-100',
                        nargs='?',
                        help='Checkpoint directory name to evaluate.')
    parser.add_argument('batch_size', 
                        type=int, 
                        default=4, 
                        nargs='?', 
                        help='Batch size for evaluation.')
    parser.add_argument('seed', 
                        type=int, 
                        default=0, 
                        nargs='?', 
                        help='Random seed.')
    parser.add_argument('remove_layer',
                        type=int,
                        default=1,
                        nargs='?',
                        help='Index of the first Transformer block to overwrite with pretrained weights.')
    args = parser.parse_args()

    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('../train/babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    test_dataset = dataset['test']  # Load test dataset
    print(test_dataset)
    
    
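    # The checkpoint path is pinned to the 10M / seed-0 training run;
    # args.seed only varies the dataset name and the sampling RNG below.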
    checkpoint_path = f'../train/checkpoints/{MODEL_NAME}/babylm_{args.perturbation}_10M_seed0/runs/{args.checkpoint_path}'
    
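    # Draw a fixed-size random subset of the test split for evaluation.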
    rng = default_rng(args.seed)
    indices = rng.choice(len(test_dataset), FILE_SAMPLE_SIZE, replace=False)
    sampled_test_dataset = test_dataset.select(indices)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    if tokenizer.pad_token is None:
        # GPT-2 tokenizers ship without a pad token by default; the padding
        # below would fail without one, so reuse the EOS token.
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
    model_ori = AutoModelForCausalLM.from_pretrained(ori_model_name)
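    # Overwrite 4 consecutive blocks, starting at args.remove_layer,
    # with their original pretrained weights.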
    model = remove_layers(model_ori, model, args.remove_layer, 4)
    print(model)
    model.eval()
    # No manual .to('cuda') needed: Trainer moves the model to the available
    # device when evaluate() runs.

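    # Pad/truncate each example to GPT-2's 1024-token context window.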
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

    tokenized_test = sampled_test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


    perplexity = get_perplexities(model, tokenizer, tokenized_test, args.batch_size)
    
    # Save the result to the specified directory
    output_directory = f"perplexities_results/{MODEL_NAME_SAVE}"
    os.makedirs(output_directory, exist_ok=True)

    # Construct the output file path
    output_file = os.path.join(output_directory, f"{args.perturbation}_{args.batch_size}_{args.seed}.csv")

    # Write the header to the CSV file if it doesn't exist
    if not os.path.exists(output_file):
        with open(output_file, 'w') as f:
            print("Writing header to CSV...")
            f.write(f"checkpoint_path, perplexity\n")

    # Append the perplexity result to the CSV file
    with open(output_file, 'a') as f:  # Open in append mode
        print("Appending result to CSV...")
        f.write(f"{args.checkpoint_path}-{args.remove_layer+1}, {perplexity}\n")