Upload folder using huggingface_hub

a0d95b0 verified about 2 months ago

23.9 kB

	import torch
	import yaml
	from dataclasses import asdict
	import draccus

	from datasets import load_dataset

	import os
	import transformers
	from transformers import (AutoModelForCausalLM, AutoTokenizer,
	LlamaTokenizer, AutoModel, AutoConfig,
	TrainingArguments)
	import inspect
	from transformers import logging as hf_logging

	import random
	import numpy as np
	from datetime import datetime

	# from XS_llama import IbaXs_LlamaModel, IbaXs_LlamaForCausalLM
	# from utils import count_parameters
	# from .configIBA import MainConfig
	from iba import (IbaXs_LlamaModel, IbaXs_LlamaForCausalLM,
	HyperNetXSexp,
	count_parameters, MainConfig, mark_iba_as_trainable_only
	)

	from transformers.models.llama.modeling_llama import (
	LlamaMLP,
	LlamaAttention,
	LlamaDecoderLayer,
	LlamaModel,
	LlamaForCausalLM
	)

	# Instruction-following prompt. `{instruction}` and `{input_section}` are
	# filled in with str.format; the target answer is appended after
	# "### Response:\n".
	_PROMPT_TEMPLATE_PARTS = (
	    "Below is an instruction that describes a task. ",
	    "Write a response that appropriately completes the request.\n\n",
	    "### Instruction:\n{instruction}\n\n{input_section}",
	    "### Response:\n",
	)
	PROMPT_TEMPLATE = "".join(_PROMPT_TEMPLATE_PARTS)

	# Device that models and tokenized inputs are moved to in this script.
	DEVICE = 'cuda'
	# torch.compile = lambda model, args, *kwargs: model

	def set_seed(seed: int):
	    """Seed the Python, NumPy, PyTorch (CPU and all CUDA devices) and transformers RNGs."""
	    seeders = (
	        random.seed,
	        np.random.seed,
	        torch.manual_seed,
	        torch.cuda.manual_seed_all,
	        transformers.set_seed,
	    )
	    # Same call order as before: every library receives the identical seed.
	    for seeder in seeders:
	        seeder(seed)

	def test_generate(config, main_cfg):
	    """Smoke-test text generation with a model built directly from ``config``.

	    Loads the tokenizer for ``main_cfg.model.base_model_name``, constructs
	    ``IbaXs_LlamaForCausalLM(config=config)`` on ``DEVICE`` (no pretrained
	    weights are loaded in this function) and prints a short sampled
	    continuation for each hard-coded prompt.
	    """
	    base_model_name = main_cfg.model.base_model_name

	    if config.model_type != 'llama':
	        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
	    elif "lama-3" in base_model_name:
	        # Llama 3 is handled separately: it is loaded through AutoTokenizer.
	        print("load llama-3 tokenizer")
	        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
	    else:
	        tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)

	    model = IbaXs_LlamaForCausalLM(config=config).to(DEVICE)
	    model.eval()

	    prompts = [
	        "The capital of France is",
	    ]
	    # Sampling settings; only 4 new tokens are generated per prompt.
	    generation_kwargs = dict(
	        max_new_tokens=4,
	        do_sample=True,
	        temperature=0.7,
	        top_k=50,
	    )

	    for i, prompt in enumerate(prompts):
	        print(f"\n--- Prompt {i+1} ---")
	        print(f"Input: {prompt}")

	        encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)

	        with torch.no_grad():
	            generated = model.generate(**encoded, **generation_kwargs)

	        # generate() returns prompt + continuation; keep only the new tokens.
	        prompt_len = encoded["input_ids"].shape[1]
	        new_token_ids = generated[0][prompt_len:]
	        generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

	        print(f"Output: {generated_text}")

	def get_hyper_model(config, base_model_name):
	    """Build an IbaXs model on CPU and copy pretrained Llama weights into it.

	    Args:
	        config: model configuration passed both to ``IbaXs_LlamaForCausalLM``
	            and to ``LlamaForCausalLM.from_pretrained``.
	        base_model_name: name or path of the pretrained Llama checkpoint.

	    Returns:
	        The ``IbaXs_LlamaForCausalLM`` instance after a non-strict
	        ``load_state_dict`` from the reference model. The model is created
	        with the default device set to CPU and is not moved afterwards.

	    Side effects:
	        Sets the torch default device to ``'cpu'``, sets the transformers log
	        verbosity to *error*, and prints the missing / unexpected keys.
	    """
	    with torch.no_grad():
	        torch.set_default_device('cpu')
	        model = IbaXs_LlamaForCausalLM(config=config)

	    # Workaround for the meta-tensor issue: materialise real fp32 weights with
	    # no device map and without the low-memory loading path.
	    transformers.logging.set_verbosity_error()
	    base_model_temp = LlamaForCausalLM.from_pretrained(
	        base_model_name,
	        config=config,
	        device_map=None,  # Strictly None
	        low_cpu_mem_usage=False,  # Force real memory
	        torch_dtype=torch.float32,
	    )
	    missing_keys, unexpected_keys = model.load_state_dict(
	        base_model_temp.state_dict(), strict=False
	    )

	    # Debug aid (REMEMBER: set the validation size to 1 first):
	    #   compare_models(model, base_model_temp, base_model_name)
	    # The reference model is only a weight source, so it is released here
	    # without first being copied to the GPU.
	    del base_model_temp
	    torch.cuda.empty_cache()

	    if missing_keys:
	        print('missing_keys:')
	        for key in missing_keys:
	            # Only report per-layer keys that do not belong to the hypernetwork
	            # parts (those are expected to be absent from the base checkpoint).
	            if 'layers' in key and 'hypernetxs' not in key and 'layer_idx_hyperxs' not in key:
	                print(f" missing: [x] {key}")
	    else:
	        print("\n>>> No missing keys.")

	    if unexpected_keys:
	        for key in unexpected_keys:
	            print(f" [?] {key}")
	    else:
	        print("\n>>> No unexpected keys.")

	    return model
	def compare_models(custom_model, ref_model, base_model_name, device="cuda"):
	    """
	    Compare the logits of the custom IbaXs model with a reference Llama model.
	    REMEMBER: SET VALID SIZE = 1

	    Args:
	        custom_model: model under test; its inputs are placed on ``device``.
	        ref_model: reference model; its inputs are placed on ``ref_model.device``.
	        base_model_name: tokenizer name or path.
	        device: device used for the custom model's inputs.

	    Prints the max / mean absolute logit difference and a verdict
	    (threshold 1e-3). Returns nothing.
	    """

	    def setup_precise_gpu_environment():
	        """
	        Configure PyTorch to prefer numerical precision over speed on GPU,
	        so GPU results are easier to compare against CPU results.
	        """
	        # 1. Disable TensorFloat-32 so matmul / conv run in true float32.
	        torch.backends.cuda.matmul.allow_tf32 = False
	        torch.backends.cudnn.allow_tf32 = False

	        # 2. Prefer deterministic cuDNN algorithms and disable autotuning.
	        torch.backends.cudnn.benchmark = False
	        torch.backends.cudnn.deterministic = True

	        # Note: torch.use_deterministic_algorithms(True) is intentionally not
	        # enabled; it may require CUBLAS_WORKSPACE_CONFIG=:4096:8.
	        print(">> GPU Precision Setup: TF32 Disabled. Deterministic Mode set (partial).")

	    setup_precise_gpu_environment()

	    print(f"\n--- Starting Comparison on {device} {custom_model.dtype} {ref_model.dtype}---")
	    ref_model.eval()
	    custom_model.eval()

	    # Prepare a small dummy input.
	    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    text = "Hello, this is a test for model comparison."
	    encoded = tokenizer(text, return_tensors="pt")

	    # BatchEncoding.to() moves its tensors in place and returns the same
	    # object, so two `.to()` calls would leave both names pointing at tensors
	    # on the last device. Build an independent dict for each model instead.
	    custom_inputs = {key: value.to(device) for key, value in encoded.items()}
	    ref_inputs = {key: value.to(ref_model.device) for key, value in encoded.items()}

	    # Forward passes (no gradients needed).
	    with torch.no_grad():
	        print("Running inference on Custom Model...")
	        logits_custom = custom_model(**custom_inputs).logits

	        print("Running inference on Reference Model...")
	        logits_ref = ref_model(**ref_inputs).logits

	    # Compare on CPU to avoid device-mismatch errors.
	    diff = (logits_custom.cpu() - logits_ref.cpu()).abs()
	    max_diff = diff.max().item()
	    mean_diff = diff.mean().item()

	    print("\n--- Comparison Results ---")
	    print(f"Max Absolute Difference: {max_diff:.6f}")
	    print(f"Mean Absolute Difference: {mean_diff:.6f}")

	    # Show the first few logits of the last token for a quick eyeball check.
	    print("\nFirst 5 logits (Last Token):")
	    print(f"Custom: {logits_custom[0, -1, :5].cpu().tolist()}")
	    print(f"Ref : {logits_ref[0, -1, :5].cpu().tolist()}")

	    if max_diff < 1e-3:
	        print(">> VERDICT: Models are effectively IDENTICAL.")
	    else:
	        print(">> VERDICT: Models are DIFFERENT (Expected if custom layers are random initialized).")

	    # Drops only this function's reference; the caller must release its own
	    # reference for the reference model's memory to actually be freed.
	    del ref_model
	    torch.cuda.empty_cache()

	class GradientInspector:
	    """
	    A debugging tool to attach hooks to PyTorch modules.
	    It prints the gradient norm flowing through specific layers during backward pass.

	    Typical use: ``register_hooks(model)`` before training, ``clear_hooks()``
	    afterwards.
	    """

	    def __init__(self):
	        # Handles returned by register_full_backward_hook, kept so that the
	        # hooks can be removed again.
	        self.hooks = []

	    def print_grad_stats(self, module, grad_input, grad_output):
	        """
	        Backward-hook callback: print gradient norms for ``module``.

	        ``grad_output`` holds the gradients arriving at the module's outputs,
	        ``grad_input`` the gradients leaving through its inputs. Returns None,
	        so the gradients themselves are left unchanged.
	        """
	        from tqdm import tqdm

	        # debug_name is attached to the module by register_hooks.
	        name = getattr(module, 'debug_name', 'Unknown Layer')

	        if grad_output[0] is not None:
	            out_norm = grad_output[0].norm().item()
	            tqdm.write(f"[DEBUG-BACKWARD] {name} | Output Grad Norm (from upstream): {out_norm:.6f}")
	        else:
	            tqdm.write(f"[DEBUG-BACKWARD] {name} | Output Grad is None!")

	        # Some layers (like input embeddings) may have no input gradient.
	        if grad_input[0] is None:
	            return

	        in_norm = grad_input[0].norm().item()
	        tqdm.write(f"[DEBUG-BACKWARD] {name} | Input Grad Norm (passing downstream): {in_norm:.6f}")
	        if in_norm == 0:
	            tqdm.write(f" >>> ALARM: Gradient died at {name}!")

	    def _attach(self, module, debug_name):
	        """Tag ``module`` with ``debug_name`` and register the backward hook on it."""
	        module.debug_name = debug_name
	        self.hooks.append(module.register_full_backward_hook(self.print_grad_stats))

	    def register_hooks(self, model):
	        """
	        Attach hooks to the hypernetwork (``model.model.hypernetxs`` and its
	        ``c_proj`` layer, when present) and to the first module whose type
	        name contains "Linear".
	        """
	        from tqdm import tqdm

	        tqdm.write("Registering debug hooks...")

	        # 1. The hypernetwork as a whole, plus its last projection if it has one.
	        if hasattr(model.model, 'hypernetxs'):
	            hypernet = model.model.hypernetxs
	            self._attach(hypernet, "HyperNetwork_Top")
	            if hasattr(hypernet, 'c_proj'):
	                self._attach(hypernet.c_proj, "HyperNetwork_Last_Linear")

	        # 2. Only the first "Linear"-typed module is hooked.
	        for name, module in model.named_modules():
	            if "Linear" in str(type(module)):
	                self._attach(module, f"DynamicLayer_First_{name}")
	                break

	        print(f"Registered {len(self.hooks)} hooks.")

	    def clear_hooks(self):
	        """Remove every registered hook and forget the handles."""
	        for handle in self.hooks:
	            handle.remove()
	        # Empty the list so a later register_hooks() reports a correct count
	        # and clear_hooks() does not touch stale handles again.
	        self.hooks.clear()

	def reset_trainable_modules(model):
	    """Call ``reset_parameters()`` on every HyperNetXSexp / IbaXs_LlamaModel submodule.

	    Submodules of those types that do not define ``reset_parameters`` are
	    skipped. The name of each reset submodule is printed, and the same
	    ``model`` object is returned.
	    """
	    resettable_types = (HyperNetXSexp, IbaXs_LlamaModel)
	    for name, module in model.named_modules():
	        if not isinstance(module, resettable_types):
	            continue
	        if not hasattr(module, 'reset_parameters'):
	            continue
	        module.reset_parameters()
	        print('reset: ', name)
	    return model


	def _load_training_tokenizer(config, base_model_name):
	    """Load the tokenizer for ``base_model_name`` and configure its padding."""
	    if config.model_type != 'llama':
	        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
	    elif "lama-3" in base_model_name:
	        # Llama 3 is handled separately: it is loaded through AutoTokenizer.
	        print("load llama-3 tokenizer")
	        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
	    else:
	        tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)

	    # Pad with id 0 (unk) so that padding stays different from the EOS token.
	    tokenizer.pad_token_id = 0
	    tokenizer.padding_side = "left"  # Allow batched inference
	    return tokenizer


	def _load_resume_weights(model, resume_dir):
	    """Load a weight file found in ``resume_dir`` into ``model`` and return it."""
	    checkpoint_name = os.path.join(resume_dir, "pytorch_model.bin")  # Full checkpoint
	    if not os.path.exists(checkpoint_name):
	        # Same content saved under a different file name.
	        checkpoint_name = os.path.join(resume_dir, "adapter_model.bin")

	    if not os.path.exists(checkpoint_name):
	        print(f"Checkpoint {checkpoint_name} not found")
	        return model

	    print(f"Restarting from {checkpoint_name}")
	    # NOTE(review): this loads the located state dict into the already-built
	    # model (non-strict). Confirm this matches how checkpoints are saved.
	    state_dict = torch.load(checkpoint_name, map_location="cpu", weights_only=True)
	    model.load_state_dict(state_dict, strict=False)
	    return model


	def trainIBA(config, main_cfg):
	    """Fine-tune the IbaXs model with the Hugging Face ``Trainer``.

	    Steps: build ``TrainingArguments`` from ``main_cfg.training``, load the
	    model from the local ``./SVD64_llama2`` directory, reset and mark the
	    trainable modules, tokenize the dataset with ``PROMPT_TEMPLATE``, train,
	    and save tokenizer / config / model under a run-specific output directory.

	    Args:
	        config: Hugging Face model config (carries ``model_type``).
	        main_cfg: project configuration with ``training``, ``data``, ``model``
	            and ``hyperxs`` sections.
	    """
	    training_cfg = main_cfg.training
	    data_cfg = main_cfg.data
	    cutoff_len = main_cfg.model.cutoff_len
	    base_model_name = main_cfg.model.base_model_name

	    # Forward only the config fields that TrainingArguments accepts.
	    valid_hf_arg_names = set(inspect.signature(TrainingArguments).parameters)
	    trainer_args = TrainingArguments(**{
	        key: value
	        for key, value in asdict(training_cfg).items()
	        if key in valid_hf_arg_names
	    })

	    # Under DDP each process must keep the whole model on its own GPU.
	    device_map = "auto"
	    world_size = int(os.environ.get("WORLD_SIZE", 1))
	    if world_size != 1:
	        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
	        # NOTE(review): an earlier version computed
	        # `gradient_accumulation_steps // world_size` here but never passed it
	        # to the trainer; decide whether trainer_args should be scaled.

	    # One-off path: flip to True to rebuild the './SVD64_llama2' checkpoint
	    # from the base model, then exit.
	    regenerate_svd_checkpoint = False
	    if regenerate_svd_checkpoint:
	        model = get_hyper_model(config=config, base_model_name=base_model_name)
	        mark_iba_as_trainable_only(model)
	        count_parameters(model)
	        model.reset_BA_xslora()
	        model.save_pretrained('./SVD64_llama2', safe_serialization=False)
	        raise SystemExit

	    hf_logging.set_verbosity_error()
	    model = IbaXs_LlamaForCausalLM.from_pretrained(
	        './SVD64_llama2',
	        device_map=device_map,
	        dtype=torch.bfloat16,
	        config=config,
	        local_files_only=True,  # Strictly force loading from local files
	        ignore_mismatched_sizes=True,
	    )
	    hf_logging.set_verbosity_warning()
	    # Reset the trainable hypernetwork modules before training.
	    model = reset_trainable_modules(model)
	    mark_iba_as_trainable_only(model)
	    count_parameters(model)

	    tokenizer = _load_training_tokenizer(config, base_model_name)

	    def generate_and_tokenize_prompt(data_point):
	        """Render one example with PROMPT_TEMPLATE and tokenize it with labels."""
	        instruction = data_point.get("instruction", "")
	        inp = data_point.get("input", "")
	        target_output = data_point.get("output", "")

	        # Must match the evaluation template exactly.
	        input_section = f"### Input:\n{inp}\n\n" if inp and str(inp).strip() else ""
	        source_text = PROMPT_TEMPLATE.format(
	            instruction=instruction,
	            input_section=input_section,
	        )
	        full_text = source_text + target_output + tokenizer.eos_token

	        tokenized_full = tokenizer(
	            full_text, truncation=True, max_length=cutoff_len, padding=False
	        )
	        input_ids = tokenized_full["input_ids"]

	        if main_cfg.model.train_on_inputs:
	            # Train on prompt and answer alike. Labels must still be present,
	            # otherwise no loss can be computed.
	            tokenized_full["labels"] = list(input_ids)
	        else:
	            tokenized_source = tokenizer(
	                source_text, truncation=True, max_length=cutoff_len, padding=False
	            )
	            # Clamp so labels can never become longer than input_ids.
	            source_len = min(len(tokenized_source["input_ids"]), len(input_ids))
	            # -100 is ignored by the loss, so only the answer tokens are trained.
	            tokenized_full["labels"] = [-100] * source_len + input_ids[source_len:]

	        return tokenized_full

	    if data_cfg.data_path.endswith(".json"):
	        data = load_dataset("json", data_files=data_cfg.data_path)
	    else:
	        data = load_dataset(data_cfg.data_path)

	    resume_dir = training_cfg.resume_from_checkpoint
	    if resume_dir:
	        model = _load_resume_weights(model, resume_dir)

	    if data_cfg.val_set_size > 0:
	        train_val = data["train"].train_test_split(
	            test_size=data_cfg.val_set_size, shuffle=True, seed=42
	        )
	        train_data = train_val["train"].map(generate_and_tokenize_prompt, num_proc=8)
	        val_data = train_val["test"].map(generate_and_tokenize_prompt)
	    else:
	        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt, num_proc=8)
	        val_data = None
	    # val_data is None when no validation split was requested.
	    print('data size', len(train_data), len(val_data) if val_data is not None else 0)

	    # Gradient debugging: create a GradientInspector and call
	    # inspector.register_hooks(model) here, inspector.clear_hooks() at the end.

	    start_time = datetime.now()
	    date_str = start_time.strftime("%dd%Hh%Mm%S")
	    output_dir = (
	        f'{trainer_args.output_dir}/{main_cfg.data.dataset_name}/'
	        f't={date_str},'
	        f'mlr{trainer_args.learning_rate:.1e},'
	        f'b{trainer_args.per_device_train_batch_size},'
	        f'r{main_cfg.hyperxs.lora_attn_dim},n_ct{main_cfg.hyperxs.n_cross_attn_tokens},'
	        f't{date_str},'
	        f'init{main_cfg.run_text},dr{main_cfg.hyperxs.drop_out},'
	        f'ep{trainer_args.num_train_epochs},'
	        f'ds{len(train_data)}'
	    )
	    trainer_args.output_dir = output_dir
	    print(f'Current output_dir: {output_dir}')

	    trainer = transformers.Trainer(
	        model=model,
	        train_dataset=train_data,
	        eval_dataset=val_data,
	        args=trainer_args,
	        data_collator=transformers.DataCollatorForSeq2Seq(
	            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
	        ),
	    )
	    model.config.use_cache = False

	    trainer.train()
	    end_time = datetime.now()
	    print('end time: ', end_time.strftime("%Y-%m-%d %H:%M:%S"), '| duration: ', end_time - start_time)

	    # NOTE(review): tokenizer and config go to 'ft' while the model goes to
	    # 'ft2'; kept as-is, confirm that the split is intentional.
	    tokenizer.save_pretrained(os.path.join(trainer_args.output_dir, 'ft'))
	    trainer.save_state()
	    config.save_pretrained(os.path.join(trainer_args.output_dir, 'ft'))
	    model.save_pretrained(os.path.join(trainer_args.output_dir, 'ft2'), safe_serialization=False)


	@draccus.wrap(config_path="./config_draccus/config.yaml")
	def main(main_cfg: MainConfig):
	    """Script entry point: build the HF model config, seed the RNGs and start training.

	    ``main_cfg`` is supplied by the ``draccus.wrap`` decorator, which is given
	    ``./config_draccus/config.yaml`` as its config path.
	    """
	    config = AutoConfig.from_pretrained(main_cfg.model.base_model_name)

	    # Optional shrink-for-debugging overrides:
	    # config.hidden_size=128
	    # config.intermediate_size=290
	    # config.num_hidden_layers=3
	    # config.head_dim = config.hidden_size // config.num_attention_heads

	    # Attach the whole run configuration to the HF config as a plain dict.
	    config.main_cfg = asdict(main_cfg)

	    set_seed(main_cfg.seed)
	    trainIBA(config, main_cfg)


	if __name__ == "__main__":
	    main()