| import torch |
| import sys |
| import argparse |
| import os |
| sys.path.append("..") |
| from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling |
| from datasets import load_dataset |
| from numpy.random import default_rng |
| os.environ["TOKENIZERS_PARALLELISM"] = "false" |
|
|
| |
| |
|
|
# Base model folder name used to build the checkpoint path below.
MODEL_NAME = "Qwen2.5-7B"
# Subdirectory name under perplexities_results/ where CSV output is written.
MODEL_NAME_SAVE = "Qwen2.5-7B-500"
# Number of test examples sampled (without replacement) for evaluation.
FILE_SAMPLE_SIZE = 500
|
|
def get_perplexities(model, eval_dataset, batch_size, eval_tokenizer=None):
    """Evaluate ``model`` on ``eval_dataset`` and return the perplexity.

    Args:
        model: Causal LM to evaluate (already moved to the target device).
        eval_dataset: Tokenized dataset (``input_ids``/``attention_mask``).
        batch_size: Per-device evaluation batch size.
        eval_tokenizer: Tokenizer used by the collator for padding. Defaults
            to the module-level ``tokenizer`` global for backward
            compatibility with existing call sites.

    Returns:
        float: ``exp(eval_loss)`` — perplexity from the mean cross-entropy
        loss over the evaluation set.
    """
    if eval_tokenizer is None:
        # NOTE(review): the original implementation silently relied on a
        # `tokenizer` global defined in the __main__ block; keep that as a
        # fallback so existing callers are unaffected.
        eval_tokenizer = tokenizer

    # mlm=False -> causal-LM collation (labels are the input ids, shifted
    # internally by the model).
    data_collator = DataCollatorForLanguageModeling(tokenizer=eval_tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir="./tmp_trainer",
        per_device_eval_batch_size=batch_size,
        fp16=True,  # NOTE(review): fp16 eval assumes a CUDA device — confirm
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    eval_results = trainer.evaluate()
    print("eval_results:", eval_results)
    # Perplexity is the exponential of the mean eval loss.
    loss = eval_results['eval_loss']
    perplexity = torch.exp(torch.tensor(loss)).item()
    return perplexity
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Calculate perplexity on test dataset.")

    # All arguments are optional positionals (nargs='?') so the script can be
    # launched with any prefix of them; the rest fall back to the defaults.
    parser.add_argument('perturbation',
                        type=str,
                        default='reverse_full',
                        nargs='?',
                        help='Type of perturbation to use.')
    parser.add_argument('train_set',
                        type=str,
                        default='test',
                        nargs='?',
                        help='Dataset size for training.')
    parser.add_argument('checkpoint_path',
                        type=str,
                        default='checkpoint-100',
                        nargs='?',
                        help='Checkpoint directory name under the run folder.')
    parser.add_argument('batch_size',
                        type=int,
                        default=4,
                        nargs='?',
                        help='Batch size for evaluation.')
    parser.add_argument('seed',
                        type=int,
                        default=0,
                        nargs='?',
                        help='Random seed.')
    args = parser.parse_args()

    # Load the test split of the (perturbed) BabyLM dataset via the local
    # dataset-loading script.
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('../train/babylm_dataset_qwen.py', name=dataset_name, trust_remote_code=True)
    test_dataset = dataset['test']
    print(test_dataset)

    # NOTE(review): the checkpoint path hard-codes `seed0` while the dataset
    # name above uses args.seed — confirm this asymmetry is intentional.
    checkpoint_path = f'../../../../../../data/02/impossible_llm/checkpoints/{MODEL_NAME}/babylm_{args.perturbation}_10M_seed0/runs/{args.checkpoint_path}'
    path_exists = os.path.exists(checkpoint_path)
    print(f"Exists: {path_exists}")
    if not path_exists:
        # Fail fast with a clear message instead of the opaque error
        # from_pretrained() would raise further down.
        sys.exit(f"Checkpoint path not found: {checkpoint_path}")

    # Deterministically subsample FILE_SAMPLE_SIZE test examples (seeded RNG,
    # no replacement) so runs with the same seed evaluate the same subset.
    rng = default_rng(args.seed)
    indices = rng.choice(len(test_dataset), FILE_SAMPLE_SIZE, replace=False)
    sampled_test_dataset = test_dataset.select(indices)
    print("sampled_test_dataset", sampled_test_dataset)

    # `tokenizer` is deliberately module-level: get_perplexities() reads it.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
    model.eval()
    if torch.cuda.is_available():
        model.to('cuda')

    def tokenize_function(examples):
        # Fixed-length padding/truncation so every eval batch shares one shape.
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

    tokenized_test = sampled_test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    perplexity = get_perplexities(model, tokenized_test, args.batch_size)

    # Append the result to a per-configuration CSV, creating it (with a
    # header row) on first use.
    output_directory = f"perplexities_results/{MODEL_NAME_SAVE}"
    os.makedirs(output_directory, exist_ok=True)

    output_file = os.path.join(output_directory, f"{args.perturbation}_{args.batch_size}_{args.seed}.csv")

    if not os.path.exists(output_file):
        with open(output_file, 'w') as f:
            print("Writing header to CSV...")
            f.write("checkpoint_path, perplexity\n")

    with open(output_file, 'a') as f:
        print("Appending result to CSV...")
        f.write(f"{args.checkpoint_path}, {perplexity}\n")