nvan13's picture
Upload folder using huggingface_hub
a0d95b0 verified
import torch
import yaml
from dataclasses import asdict
import draccus
from datasets import load_dataset
import os
import transformers
from transformers import (AutoModelForCausalLM, AutoTokenizer,
LlamaTokenizer, AutoModel, AutoConfig,
TrainingArguments)
import inspect
from transformers import logging as hf_logging
import random
import numpy as np
from datetime import datetime
# from XS_llama import IbaXs_LlamaModel, IbaXs_LlamaForCausalLM
# from utils import count_parameters
# from .configIBA import MainConfig
from iba import (IbaXs_LlamaModel, IbaXs_LlamaForCausalLM,
HyperNetXSexp,
count_parameters, MainConfig, mark_iba_as_trainable_only
)
from transformers.models.llama.modeling_llama import (
LlamaMLP,
LlamaAttention,
LlamaDecoderLayer,
LlamaModel,
LlamaForCausalLM
)
# Prompt layout shared by every training example. `{instruction}` is the task
# text; `{input_section}` is either empty or an already formatted
# "### Input:" paragraph supplied by the caller.
PROMPT_TEMPLATE = "".join(
    [
        "Below is an instruction that describes a task. ",
        "Write a response that appropriately completes the request.\n\n",
        "### Instruction:\n{instruction}\n\n",
        "{input_section}",
        "### Response:\n",
    ]
)

# Device the helper functions in this file move models and inputs to.
DEVICE = 'cuda'

# Debug switch: uncomment to turn torch.compile into a no-op.
# torch.compile = lambda model, *args, **kwargs: model
def set_seed(seed: int):
    """Seed every random number generator this script touches.

    Args:
        seed: Value passed, in this order, to Python's ``random``, NumPy,
            PyTorch (CPU, then all CUDA devices) and ``transformers.set_seed``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        transformers.set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
def test_generate(config, main_cfg):
    """Smoke-test sampling from an IBA model built directly from ``config``.

    The model is created with the ``IbaXs_LlamaForCausalLM`` constructor (not
    ``from_pretrained``), moved to ``DEVICE`` and put in eval mode. For each
    hard-coded prompt a short continuation is sampled and printed.

    Args:
        config: Model configuration; ``config.model_type`` selects the
            tokenizer class.
        main_cfg: Run configuration; only ``main_cfg.model.base_model_name``
            is read here.
    """
    base_model_name = main_cfg.model.base_model_name

    # Llama-3 checkpoints go through AutoTokenizer; other Llama checkpoints
    # use the LlamaTokenizer class; everything else falls back to
    # AutoTokenizer with remote code enabled.
    is_llama = config.model_type == 'llama'
    if is_llama and "lama-3" not in base_model_name:
        tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)
    elif is_llama:
        print("load llama-3 tokenizer")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    else:
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

    model = IbaXs_LlamaForCausalLM(config=config).to(DEVICE)
    model.eval()

    prompts = [
        "The capital of France is",
        # "Here is a simple Python function to add two numbers:"
    ]
    # Sampling settings; only a handful of new tokens are requested.
    sampling_kwargs = dict(
        max_new_tokens=4,
        do_sample=True,
        temperature=0.7,
        top_k=50,
    )

    for prompt_no, prompt in enumerate(prompts, start=1):
        print(f"\n--- Prompt {prompt_no} ---")
        print(f"Input: {prompt}")

        # Tokenize the prompt into PyTorch tensors on the target device.
        encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated = model.generate(**encoded, **sampling_kwargs)

        # The returned sequence starts with the prompt; keep only what follows.
        prompt_len = encoded["input_ids"].shape[1]
        continuation = generated[0][prompt_len:]
        generated_text = tokenizer.decode(continuation, skip_special_tokens=True)
        print(f"Output: {generated_text}")
def get_hyper_model(config, base_model_name):
    """Build an IBA model and copy the pretrained base weights into it.

    An ``IbaXs_LlamaForCausalLM`` is constructed from ``config`` with the
    default device forced to CPU, then the state dict of the pretrained
    ``LlamaForCausalLM`` checkpoint is loaded into it non-strictly. Keys that
    did not match are printed for inspection.

    Side effects: sets the global torch default device to ``'cpu'``.

    Args:
        config: Configuration used for both the IBA model and the base model.
        base_model_name: Checkpoint name or path of the pretrained base model.

    Returns:
        The ``IbaXs_LlamaForCausalLM`` instance holding the base weights; it
        is not moved to another device by this function.
    """
    with torch.no_grad():
        # Force real CPU tensors for the freshly constructed model.
        torch.set_default_device('cpu')
        model = IbaXs_LlamaForCausalLM(config=config)

    # Workaround for the "meta tensor on cuda" issue: load real fp32 weights
    # without a device map. Loading messages are silenced only for the
    # duration of the call, then the previous verbosity is restored.
    previous_verbosity = transformers.logging.get_verbosity()
    transformers.logging.set_verbosity_error()
    try:
        base_model_temp = LlamaForCausalLM.from_pretrained(
            base_model_name,
            config=config,
            device_map=None,           # Strictly None
            low_cpu_mem_usage=False,   # Force real memory
            torch_dtype=torch.float32,
        )
    finally:
        transformers.logging.set_verbosity(previous_verbosity)

    missing_keys, unexpected_keys = model.load_state_dict(
        base_model_temp.state_dict(), strict=False
    )

    # The temporary base model is only a weight source, so it is released
    # right away instead of being moved to the GPU first.
    # To debug numerical parity, call
    #     compare_models(model, base_model_temp, base_model_name)
    # here, before the delete (REMEMBER: set the validation size to 1).
    del base_model_temp

    if missing_keys:
        print('missing_keys:')
        for key in missing_keys:
            # Only per-layer keys that do not belong to the hypernetwork
            # additions are listed.
            if 'layers' in key and 'hypernetxs' not in key and 'layer_idx_hyperxs' not in key:
                print(f" missing: [x] {key}")
    else:
        print("\n>>> No missing keys.")

    if unexpected_keys:
        for key in unexpected_keys:
            print(f" [?] {key}")
    else:
        print("\n>>> No unexpected keys.")

    return model
def compare_models(custom_model, ref_model, base_model_name, device="cuda"):
    """Print how far the custom model's logits are from the reference model's.

    Both models are put in eval mode and run on one fixed sentence; the
    maximum and mean absolute logit differences are printed together with a
    verdict. REMEMBER: SET VALID SIZE = 1.

    Side effects: disables TF32 and switches cuDNN to deterministic,
    non-benchmark mode through the global ``torch.backends`` flags; these are
    not restored afterwards.

    Args:
        custom_model: The custom IbaXs model under test.
        ref_model: The reference model to compare against.
        base_model_name: Checkpoint name used to load the tokenizer.
        device: Device the tokenized inputs are first moved to.
    """
    # Favour numerical precision over speed: TF32 trades precision for
    # throughput on recent NVIDIA GPUs, and cuDNN autotuning may pick
    # non-deterministic kernels.
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # If "deterministic algorithm not found" errors ever need to be chased,
    # CUBLAS_WORKSPACE_CONFIG=:4096:8 plus
    # torch.use_deterministic_algorithms(True) is the stricter option.
    print(">> GPU Precision Setup: TF32 Disabled. Deterministic Mode set (partial).")

    print(f"\n--- Starting Comparison on {device} {custom_model.dtype} {ref_model.dtype}---")
    ref_model.eval()
    custom_model.eval()

    # Build one small fixed input.
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    sample_text = "Hello, this is a test for model comparison."
    inputs = tokenizer(sample_text, return_tensors="pt").to(device)
    # NOTE(review): if the tokenizer output's ``.to`` moves tensors in place,
    # ``inputs`` and ``ref_inputs`` end up on the same device - confirm.
    ref_inputs = inputs.to(ref_model.device)

    # Forward passes only; gradients are not needed.
    with torch.no_grad():
        print("Running inference on Custom Model...")
        logits_custom = custom_model(**inputs).logits
        print("Running inference on Reference Model...")
        logits_ref = ref_model(**ref_inputs).logits

    # Compare on CPU to avoid device mismatch errors.
    abs_diff = (logits_custom.cpu() - logits_ref.cpu()).abs()
    max_diff = abs_diff.max().item()
    mean_diff = abs_diff.mean().item()

    print("\n--- Comparison Results ---")
    print(f"Max Absolute Difference: {max_diff:.6f}")
    print(f"Mean Absolute Difference: {mean_diff:.6f}")

    # Show the first few logits of the last token for a quick visual check.
    print("\nFirst 5 logits (Last Token):")
    print(f"Custom: {logits_custom[0, -1, :5].cpu().tolist()}")
    print(f"Ref : {logits_ref[0, -1, :5].cpu().tolist()}")

    verdict = (
        ">> VERDICT: Models are effectively IDENTICAL."
        if max_diff < 1e-3
        else ">> VERDICT: Models are DIFFERENT (Expected if custom layers are random initialized)."
    )
    print(verdict)

    # Drops this function's reference to the reference model and releases
    # cached CUDA blocks; the caller's own reference is unaffected.
    del ref_model
    torch.cuda.empty_cache()
class GradientInspector:
    """Debugging tool that attaches backward hooks to PyTorch modules.

    Each hook prints the norm of the gradients flowing through the hooked
    module during the backward pass. Call ``register_hooks`` before training
    and ``clear_hooks`` when done.
    """

    def __init__(self):
        # Handles returned by ``register_full_backward_hook``; kept so that
        # ``clear_hooks`` can detach them again.
        self.hooks = []

    def print_grad_stats(self, module, grad_input, grad_output):
        """Backward-hook callback: print gradient norms for ``module``.

        Args:
            module: The hooked module; its ``debug_name`` attribute (if set)
                is used as the label in the printed lines.
            grad_input: Tuple of gradients passed on from this module;
                entries may be ``None``.
            grad_output: Tuple of gradients arriving at this module's
                outputs; entries may be ``None``.
        """
        # Local import: the file does not import tqdm at module level.
        from tqdm import tqdm

        name = getattr(module, 'debug_name', 'Unknown Layer')

        # Gradients coming from the loss towards this layer. An empty tuple
        # is treated like a missing gradient instead of raising IndexError.
        upstream = grad_output[0] if grad_output else None
        if upstream is not None:
            out_norm = upstream.norm().item()
            tqdm.write(f"[DEBUG-BACKWARD] {name} | Output Grad Norm (from upstream): {out_norm:.6f}")
        else:
            tqdm.write(f"[DEBUG-BACKWARD] {name} | Output Grad is None!")

        # Gradients passed on from this layer. Some layers (e.g. input
        # embeddings) legitimately have no input gradient, so nothing is
        # printed in that case.
        downstream = grad_input[0] if grad_input else None
        if downstream is not None:
            in_norm = downstream.norm().item()
            tqdm.write(f"[DEBUG-BACKWARD] {name} | Input Grad Norm (passing downstream): {in_norm:.6f}")
            if in_norm == 0:
                tqdm.write(f" >>> ALARM: Gradient died at {name}!")

    def register_hooks(self, model):
        """Attach gradient-printing hooks to the important modules of ``model``.

        Hooks are placed on ``model.model.hypernetxs`` (and its ``c_proj``
        sub-module) when those attributes exist, and on the first module
        returned by ``model.named_modules()`` whose type name contains
        ``"Linear"``.

        Args:
            model: Model to instrument.
        """
        from tqdm import tqdm

        tqdm.write("Registering debug hooks...")

        # 1. The hypernetwork and its last projection, when present.
        inner = getattr(model, 'model', None)
        hypernet = getattr(inner, 'hypernetxs', None)
        if hypernet is not None:
            hypernet.debug_name = "HyperNetwork_Top"
            self.hooks.append(hypernet.register_full_backward_hook(self.print_grad_stats))

            # Hooking the last linear layer shows whether its weights
            # receive any gradient at all.
            last_layer = getattr(hypernet, 'c_proj', None)
            if last_layer is not None:
                last_layer.debug_name = "HyperNetwork_Last_Linear"
                self.hooks.append(last_layer.register_full_backward_hook(self.print_grad_stats))

        # 2. The first module whose class name mentions "Linear".
        # Adjust the match if the dynamic layers use another class name.
        for name, module in model.named_modules():
            if "Linear" in str(type(module)):
                module.debug_name = f"DynamicLayer_First_{name}"
                self.hooks.append(module.register_full_backward_hook(self.print_grad_stats))
                break

        print(f"Registered {len(self.hooks)} hooks.")

    def clear_hooks(self):
        """Detach every registered hook and forget its handle."""
        for handle in self.hooks:
            handle.remove()
        # Drop the stale handles so a later register/clear cycle starts clean
        # and no handle is removed twice.
        self.hooks.clear()
def reset_trainable_modules(model):
    """Re-initialise the IBA-specific sub-modules of ``model``.

    Every sub-module that is a ``HyperNetXSexp`` or an ``IbaXs_LlamaModel``
    and exposes ``reset_parameters`` has that method called, and its
    qualified name is printed.

    Args:
        model: Model whose sub-modules are scanned via ``named_modules()``.

    Returns:
        The same ``model`` object, for call chaining.
    """
    resettable_types = (HyperNetXSexp, IbaXs_LlamaModel)
    for module_name, submodule in model.named_modules():
        if not isinstance(submodule, resettable_types):
            continue
        if not hasattr(submodule, 'reset_parameters'):
            continue
        submodule.reset_parameters()
        print('reset: ', module_name)
    return model
def _load_tokenizer(config, base_model_name):
    """Load the tokenizer for ``base_model_name`` and configure its padding."""
    if config.model_type == 'llama':
        # Due to the name of transformers' LlamaTokenizer, Llama 3 has to be
        # handled separately.
        if "lama-3" in base_model_name:
            print("load llama-3 tokenizer")
            tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        else:
            tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

    # Pad with id 0 (unk) so that padding differs from the EOS token.
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"  # Allow batched inference
    return tokenizer


def _make_prompt_tokenizer(tokenizer, cutoff_len, train_on_inputs):
    """Return the per-example function that formats and tokenizes a record."""

    def generate_and_tokenize_prompt(data_point):
        instruction = data_point.get("instruction", "")
        inp = data_point.get("input", "")
        target_output = data_point.get("output", "")

        # Must match the evaluation template exactly.
        input_section = f"### Input:\n{inp}\n\n" if inp and str(inp).strip() else ""
        source_text = PROMPT_TEMPLATE.format(
            instruction=instruction,
            input_section=input_section,
        )
        full_text = source_text + target_output + tokenizer.eos_token

        tokenized_full = tokenizer(
            full_text, truncation=True, max_length=cutoff_len, padding=False
        )
        input_ids = tokenized_full["input_ids"]

        if train_on_inputs:
            # Loss over the whole sequence. Labels must still be present,
            # otherwise the batch carries no targets.
            tokenized_full["labels"] = list(input_ids)
        else:
            tokenized_source = tokenizer(
                source_text, truncation=True, max_length=cutoff_len, padding=False
            )
            # Never mask more positions than the (possibly truncated) full
            # sequence has, so labels and input_ids keep the same length.
            source_len = min(len(tokenized_source["input_ids"]), len(input_ids))
            # -100 marks the prompt positions as masked in the labels.
            tokenized_full["labels"] = [-100] * source_len + input_ids[source_len:]
        return tokenized_full

    return generate_and_tokenize_prompt


def _resolve_resume_checkpoint(resume_from_checkpoint):
    """Return the checkpoint directory to resume from, or ``None``."""
    if not resume_from_checkpoint:
        return None
    if isinstance(resume_from_checkpoint, str) and os.path.isdir(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        return resume_from_checkpoint
    # Best effort: a missing checkpoint falls back to a fresh run.
    print(f"Checkpoint {resume_from_checkpoint} not found")
    return None


def trainIBA(config, main_cfg):
    """Fine-tune the IBA model with the Hugging Face ``Trainer``.

    Steps: build ``TrainingArguments`` from ``main_cfg.training``, load the
    pre-computed model from ``./SVD64_llama2``, reset and mark the IBA
    modules as the only trainable ones, tokenize the dataset with
    ``PROMPT_TEMPLATE``, train, and save tokenizer, config and model under
    the run's output directory.

    Args:
        config: Model configuration passed to ``from_pretrained``;
            ``config.model_type`` selects the tokenizer class.
        main_cfg: Run configuration; its ``training``, ``data``, ``model``
            and ``hyperxs`` sections and ``run_text`` are read.
    """
    training_cfg = main_cfg.training
    data_cfg = main_cfg.data
    base_model_name = main_cfg.model.base_model_name

    # Forward only the config fields that TrainingArguments accepts.
    valid_hf_arg_names = set(inspect.signature(TrainingArguments).parameters)
    trainer_args = TrainingArguments(**{
        key: value for key, value in asdict(training_cfg).items()
        if key in valid_hf_arg_names
    })

    # Under DDP each process keeps the whole model on its own local device.
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    device_map = "auto"
    if world_size != 1:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    # NOTE(review): gradient_accumulation_steps is used exactly as
    # configured; it is not divided by the world size under DDP. Confirm
    # that this is the intended global batch size.

    # One-off preparation step: flip to True to (re)build ./SVD64_llama2
    # from the base checkpoint and stop.
    precompute_svd = False
    if precompute_svd:
        model = get_hyper_model(config=config, base_model_name=base_model_name)
        mark_iba_as_trainable_only(model)
        count_parameters(model)
        model.reset_BA_xslora()
        model.save_pretrained('./SVD64_llama2', safe_serialization=False)
        raise SystemExit(0)

    # Silence loading messages only while the checkpoint is read.
    hf_logging.set_verbosity_error()
    try:
        model = IbaXs_LlamaForCausalLM.from_pretrained(
            './SVD64_llama2',
            device_map=device_map,
            dtype=torch.bfloat16,
            config=config,
            local_files_only=True,  # Strictly local, no internet check for config
            ignore_mismatched_sizes=True,
        )
    finally:
        hf_logging.set_verbosity_warning()

    # Reset the trainable hypernets, then freeze everything else.
    model = reset_trainable_modules(model)
    mark_iba_as_trainable_only(model)
    count_parameters(model)

    tokenizer = _load_tokenizer(config, base_model_name)
    generate_and_tokenize_prompt = _make_prompt_tokenizer(
        tokenizer,
        cutoff_len=main_cfg.model.cutoff_len,
        train_on_inputs=main_cfg.model.train_on_inputs,
    )

    if data_cfg.data_path.endswith(".json"):
        data = load_dataset("json", data_files=data_cfg.data_path)
    else:
        data = load_dataset(data_cfg.data_path)

    # Resuming is delegated to the Trainer (see trainer.train below).
    resume_checkpoint = _resolve_resume_checkpoint(training_cfg.resume_from_checkpoint)

    if data_cfg.val_set_size > 0:
        train_val = data["train"].train_test_split(
            test_size=data_cfg.val_set_size, shuffle=True, seed=42
        )
        train_data = train_val["train"].map(generate_and_tokenize_prompt, num_proc=8)
        val_data = train_val["test"].map(generate_and_tokenize_prompt)
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt, num_proc=8)
        val_data = None
    print('data size', len(train_data), len(val_data) if val_data is not None else 0)

    # Gradient debugging: create a GradientInspector and call
    # register_hooks(model) here, then clear_hooks() after training.

    start_time = datetime.now()
    date_str = start_time.strftime("%dd%Hh%Mm%S")
    # The directory name encodes the main hyper-parameters of the run.
    output_dir = f'{trainer_args.output_dir}/{main_cfg.data.dataset_name}/'\
        f't={date_str},' \
        f'mlr{trainer_args.learning_rate:.1e},'\
        f'b{trainer_args.per_device_train_batch_size},'\
        f'r{main_cfg.hyperxs.lora_attn_dim},n_ct{main_cfg.hyperxs.n_cross_attn_tokens},'\
        f't{date_str},' \
        f'init{main_cfg.run_text},dr{main_cfg.hyperxs.drop_out},'\
        f'ep{trainer_args.num_train_epochs},' \
        f'ds{len(train_data)}'
    trainer_args.output_dir = output_dir
    print(f'Current output_dir: {output_dir}')

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=trainer_args,
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    model.config.use_cache = False

    trainer.train(resume_from_checkpoint=resume_checkpoint)

    end_time = datetime.now()
    print('end time: ', end_time.strftime("%Y-%m-%d %H:%M:%S"), '| duration: ', end_time - start_time)

    # Tokenizer and config go to 'ft'; the model weights go to 'ft2'.
    final_dir = os.path.join(trainer_args.output_dir, 'ft')
    tokenizer.save_pretrained(final_dir)
    trainer.save_state()
    config.save_pretrained(final_dir)
    model.save_pretrained(os.path.join(trainer_args.output_dir, 'ft2'), safe_serialization=False)
@draccus.wrap(config_path="./config_draccus/config.yaml")
def main(main_cfg: MainConfig):
    """Script entry point: build the model config, seed the RNGs and train.

    Args:
        main_cfg: Run configuration parsed by draccus from
            ``./config_draccus/config.yaml`` (plus any CLI overrides).
    """
    config = AutoConfig.from_pretrained(main_cfg.model.base_model_name)

    # For quick debugging the model can be shrunk here, e.g. by lowering
    # config.hidden_size / intermediate_size / num_hidden_layers and
    # recomputing config.head_dim.

    # Attach the run configuration as a plain dict to the model config.
    config.main_cfg = asdict(main_cfg)

    set_seed(main_cfg.seed)
    trainIBA(config, main_cfg)
# Run the draccus-wrapped entry point only when executed as a script.
if __name__ == "__main__":
    main()