import sys
sys.path.append("..")

from utils_qwen import PERTURBATIONS
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from glob import glob
from numpy.random import default_rng
import pandas as pd
import torch
import itertools
import argparse
import os

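# Number of token sequences sampled from each test file, the evaluation
# batch size, and the target device (assumes a CUDA-capable GPU).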
FILE_SAMPLE_SIZE = 1000
BATCH_SIZE = 8
device = "cuda"

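# Base model identifiers; MODEL_NAME_SAVE is used when naming output files.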
MODEL_NAME = "Qwen/Qwen2.5-0.5B"
MODEL_NAME_SAVE = "Qwen2.5-0.5B"

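# Fine-tuned checkpoint whose weights are loaded for evaluation.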
checkpoint_path = 'checkpoint-2000'
checkpoint_dir = f'../train/checkpoints/babylm/babylm_shuffle_nondeterministic_10M_seed0/runs/{checkpoint_path}'


def create_attention_mask(token_lists):
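    """Build a 0/1 attention mask for a batch of variable-length token
    lists: 1 marks real tokens, 0 marks padding positions."""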
    seq_length = max(len(tokens) for tokens in token_lists)
    batch_size = len(token_lists)
    mask = torch.zeros((batch_size, seq_length), dtype=torch.long)

    for i, tokens in enumerate(token_lists):
        mask[i, :len(tokens)] = 1

    return mask


def create_input_ids(token_lists, pad_token_id):
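    """Right-pad a batch of token lists to equal length with `pad_token_id`.

    `zip_longest` over the transposed lists does the padding; zipping again
    transposes back to batch-major order.
    """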
    padded = zip(*itertools.zip_longest(*token_lists, fillvalue=pad_token_id))
    return torch.tensor(list(padded))


def get_perplexities(model, token_lists, pad_token_id, device="cuda"):
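    """Compute a per-sequence perplexity for each token list in the batch.

    Logits and labels are shifted by one position so that each token is
    predicted from its left context; padding positions are excluded from
    the per-sequence average via the attention mask.
    """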
    input_ids = create_input_ids(token_lists, pad_token_id).to(device)
    labels = input_ids.clone()
    attention_mask = create_attention_mask(token_lists).to(device)

    # No gradients are needed at evaluation time; the loss is computed
    # manually below, so labels are not passed to the model.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Shift so that the token at position t is predicted from positions < t.
    shift_logits = outputs.logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    shift_attention_mask = attention_mask[..., 1:].contiguous()

    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))

    # Average token-level losses over non-padding positions only, then
    # exponentiate the mean NLL to obtain perplexity.
    loss = loss.view(shift_labels.size())
    loss = loss * shift_attention_mask
    per_example_loss = loss.sum(dim=1) / shift_attention_mask.sum(dim=1)
    return torch.exp(per_example_loss).tolist()


def models_are_equal(model1, model2):
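    """Return True if two models have the same type and identical parameter
    values."""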
    if type(model1) is not type(model2):
        return False

    for param1, param2 in zip(model1.parameters(), model2.parameters()):
        if not torch.equal(param1.data, param2.data):
            return False

    return True


def print_lora_output(module, input, output):
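    """Forward hook for debugging: print a module's output (e.g. when
    inspecting LoRA-adapted layers)."""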
    print(f"{module.__class__.__name__} output with LoRA: {output}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='Perplexity evaluation',
        description='Compute perplexities on perturbed BabyLM test data')

    parser.add_argument('test_perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform test BabyLM dataset')
    parser.add_argument('random_seed', type=int, help="Random seed")
    args = parser.parse_args()
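    # Example invocation (script name and perturbation key are illustrative):
    #   python eval_perplexity.py shuffle_nondeterministic 0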

    test_files = sorted(glob(
        f"../data/babylm_data_perturbed_qwen/babylm_{args.test_perturbation_type}/babylm_test_affected/*"))

    rng = default_rng(args.random_seed)

| print("Sampling BabyLM affected test files to extract surprisals...") |
| token_sequences = [] |
| print("test_files:", test_files) |
| for test_file in test_files: |
| print(test_file) |
| with open(test_file, 'r') as f: |
| file_token_sequences = [ |
| [int(s) for s in l.split()] for l in f.readlines()] |
| sample_indices = rng.choice( |
| list(range(len(file_token_sequences))), FILE_SAMPLE_SIZE, replace=False) |
| file_token_sequences = [file_token_sequences[i] |
| for i in sample_indices] |
| token_sequences.extend(file_token_sequences) |

    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir).to(device)
    model.eval()  # disable dropout for deterministic evaluation

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

    test_sents = [tokenizer.decode(toks) for toks in token_sequences]

    perplexities = []
    for i in tqdm(range(0, len(token_sequences), BATCH_SIZE)):
        batch = token_sequences[i:i + BATCH_SIZE]
        # The EOS token doubles as the padding token; padded positions are
        # masked out inside get_perplexities.
        ppls = get_perplexities(model, batch, tokenizer.eos_token_id)
        perplexities.extend(ppls)

    ppl_df = pd.DataFrame({
        "Sentences": test_sents,
        "Perplexities": perplexities
    })

    # Create the full nested output directory so to_csv has a valid path.
    directory = f"perplexity_results/{MODEL_NAME_SAVE}/{args.test_perturbation_type}"
    os.makedirs(directory, exist_ok=True)
    print("directory:", directory)
    file = f"{directory}/{MODEL_NAME_SAVE}_seed{args.random_seed}_test_{args.test_perturbation_type}_{checkpoint_path}.csv"
    print(f"Writing results to CSV: {file}")
    ppl_df.to_csv(file, index=False)