import hydra
from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf, DictConfig
import torch
import yaml
from dataclasses import asdict
from datasets import load_dataset
import os
import transformers
from transformers import (AutoModelForCausalLM, AutoTokenizer,
LlamaTokenizer, AutoModel, AutoConfig,
TrainingArguments)
import inspect
import random
import numpy as np
# from XS_llama import IbaXs_LlamaModel, IbaXs_LlamaForCausalLM
# from utils import count_parameters
# from .configIBA import MainConfig
from iba import IbaXs_LlamaModel, IbaXs_LlamaForCausalLM, count_parameters, MainConfig
from transformers.models.llama.modeling_llama import (
LlamaMLP,
LlamaAttention,
LlamaDecoderLayer,
LlamaModel,
LlamaForCausalLM
)
# Create the ConfigStore instance
cs = ConfigStore.instance()
# Register 'MainConfig' as the structured-config schema, stored under the name 'main_schema'
# (the YAML config itself is selected by config_name="config" in the @hydra.main decorator below)
cs.store(name="main_schema", node=MainConfig)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
def set_seed(seed: int):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
transformers.set_seed(seed)
def test_generate(config, main_cfg):
    # Quick sanity check: load the tokenizer, build the IBA model from `config`,
    # and sample a few tokens for each test prompt.
base_model_name = main_cfg.model.base_model_name
if config.model_type == 'llama':
        # transformers' LlamaTokenizer (SentencePiece-based) only covers Llama 1/2 checkpoints,
        # so Llama 3 models have to be loaded with AutoTokenizer instead.
if "lama-3" in base_model_name:
print("load llama-3 tokenizer")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
else:
tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)
else:
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
model = IbaXs_LlamaForCausalLM(config=config).to(DEVICE)
model.eval()
prompts = [
"The capital of France is",
#"Here is a simple Python function to add two numbers:"
]
for i, prompt in enumerate(prompts):
print(f"\n--- Prompt {i+1} ---")
print(f"Input: {prompt}")
# 4.1. Tokenize the Input
# Convert the prompt string to PyTorch tensors
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
# 4.2. Generate Text
# Use torch.no_grad() for inference
with torch.no_grad():
outputs = model.generate(
**inputs,
                max_new_tokens=4,  # generate up to 4 new tokens (kept small for a quick test)
do_sample=True,
temperature=0.7,
top_k=50
# Note: We don't need 'add_generation_prompt' here
)
# 4.3. Decode the Output
# The output includes the prompt, so we slice it
output_tokens = outputs[0][inputs["input_ids"].shape[1]:]
generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
print(f"Output: {generated_text}")
def trainIBA(config, main_cfg):
training_cfg = main_cfg.training
data_cfg = main_cfg.data
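    # Keep only the fields that TrainingArguments actually accepts; the Hydra training
    # config also carries custom keys (e.g. batch_size, cutoff_len, val_set_size,
    # train_on_inputs) that would otherwise make the constructor raise a TypeError.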
valid_hf_arg_names = set(inspect.signature(TrainingArguments).parameters.keys())
training_config_dict = OmegaConf.to_container(
training_cfg, resolve=True
)
filtered_training_args_dict = {
key: value for key, value in training_config_dict.items()
if key in valid_hf_arg_names
}
trainer_args = TrainingArguments(**filtered_training_args_dict)
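    # Effective global batch size = per_device_train_batch_size * gradient_accumulation_steps
    # (* world_size under DDP), so the accumulation steps are divided by world_size below
    # to keep the global batch size equal to training_cfg.batch_size.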
gradient_accumulation_steps = training_cfg.batch_size // training_cfg.per_device_train_batch_size
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
gradient_accumulation_steps = gradient_accumulation_steps // world_size
base_model_name = main_cfg.model.base_model_name
# model = AutoModelForCausalLM.from_pretrained(
# base_model_name,
# load_in_8bit=False,
# torch_dtype=torch.float,
# device_map={"": int(os.environ.get("LOCAL_RANK") or 0)},
# trust_remote_code=True,
# )
    model = IbaXs_LlamaForCausalLM(config=config).to(DEVICE)  # randomly initialized IBA model built from `config` (no pretrained weights)
# model = LlamaForCausalLM(config=config).to('mps')
# model = IbaXs_LlamaModel.from_pretrained(main_cfg.model.base_model_name, config=config,
# torch_dtype=torch.float, device_map={"": int(os.environ.get("LOCAL_RANK") or 0)},
# trust_remote_code=True)
if config.model_type == 'llama':
        # transformers' LlamaTokenizer (SentencePiece-based) only covers Llama 1/2 checkpoints,
        # so Llama 3 models have to be loaded with AutoTokenizer instead.
if "lama-3" in base_model_name:
print("load llama-3 tokenizer")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
else:
tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)
else:
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token_id = (
0 # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left" # Allow batched inference
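    # Tokenize one prompt string: truncate to cutoff_len, append an EOS token when there is
    # room (so generation learns to stop), and copy input_ids into labels for the
    # causal-LM objective.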
def tokenize(prompt, max_length=training_cfg.cutoff_len, add_eos_token=True):
result = tokenizer(
prompt,
truncation=True,
            max_length=max_length,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < max_length
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
if "chatglm" not in base_model_name:
result["attention_mask"].append(1)
result["labels"] = result["input_ids"].copy()
if "chatglm" in base_model_name:
return {"input_ids": result["input_ids"], "labels": result["labels"]}
else:
return result
def generate_and_tokenize_prompt(data_point):
full_prompt = generate_prompt(data_point)
tokenized_full_prompt = tokenize(full_prompt)
if not training_cfg.train_on_inputs:
user_prompt = generate_prompt({**data_point, "output": ""})
tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
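            # -100 is the ignore_index of the cross-entropy loss used by the HF causal-LM
            # head, so masking the prompt positions means only the response tokens
            # contribute to the loss.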
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt
# print('model', model)
if data_cfg.data_path.endswith(".json"):
data = load_dataset("json", data_files=data_cfg.data_path)
else:
data = load_dataset(data_cfg.data_path)
### Check later
    resume_from_checkpoint = training_cfg.resume_from_checkpoint
    if resume_from_checkpoint:
        # Check the available weights and load them
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "pytorch_model.bin"
        )  # Full checkpoint
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            )  # only LoRA model - LoRA config above has to fit
            resume_from_checkpoint = (
                False  # So the trainer won't try loading its state
            )
        # The two files above have a different name depending on how they were saved, but are actually the same.
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            # NOTE: hard-coded path left over from testing; ideally the weights found at
            # checkpoint_name should be loaded here instead.
            model = IbaXs_LlamaModel.from_pretrained("./my-saved-model")
        else:
            print(f"Checkpoint {checkpoint_name} not found")
# model.print_trainable_parameters()
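    # Optionally hold out a validation split, then map prompt building + tokenization over
    # the dataset (num_proc workers speed up the larger training split).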
if training_cfg.val_set_size > 0:
train_val = data["train"].train_test_split(
test_size=training_cfg.val_set_size, shuffle=True, seed=42
)
train_data = (
train_val["train"].map(generate_and_tokenize_prompt, num_proc=8)
)
val_data = (
train_val["test"].map(generate_and_tokenize_prompt)
)
else:
train_data = data["train"].shuffle().map(generate_and_tokenize_prompt, num_proc=8)
val_data = None
# print('val data', type(val_data), val_data)
# for k,v in val_data[0].items():
# print('kv', k, ': ', v)
# count_parameters(model)
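    # DataCollatorForSeq2Seq dynamically pads input_ids/attention_mask with the pad token
    # and labels with -100, up to the longest sequence in the batch (rounded to a multiple of 8).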
trainer = transformers.Trainer(
model=model,
train_dataset=train_data,
eval_dataset=val_data,
args=trainer_args,
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
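    # The KV cache is only useful at generation time and conflicts with gradient
    # checkpointing, so it is disabled for training.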
model.config.use_cache = False
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
def generate_prompt(data_point):
# sorry about the formatting disaster gotta move fast
if data_point["input"]:
return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}""" # noqa: E501
else:
return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Response:
{data_point["output"]}""" # noqa: E501
@hydra.main(config_path="../conf_hydra", config_name="config", version_base='1.3')
def main(main_cfg: MainConfig):
#
main_cfg = OmegaConf.merge(OmegaConf.structured(MainConfig), main_cfg)
# print('Hello,', main_cfg)
# print(OmegaConf.to_yaml(main_cfg))
# cfg_dict = asdict(main_cfg)
main_cfg_dict = OmegaConf.to_container(main_cfg, resolve=True)
# print(yaml.dump(cfg_dict, indent=2, default_flow_style=False))
config = AutoConfig.from_pretrained(
main_cfg.model.base_model_name
)
# print(config)
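    # Shrink the loaded config to a tiny 3-layer, 128-dim model so the pipeline can be
    # smoke-tested quickly; remove these overrides to train at the base model's real size.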
config.hidden_size=128
config.intermediate_size=290
config.num_hidden_layers=3
# config._attn_implementation = "eager"
config.head_dim = config.hidden_size // config.num_attention_heads
# main_cfg_dict = asdict(main_cfg)
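    # Stash the resolved Hydra config on the HF config object, presumably so the IbaXs_*
    # modules can read the IBA-specific settings from model.config.main_cfg.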
config.main_cfg = main_cfg_dict
set_seed(main_cfg.seed)
trainIBA(config, main_cfg)
if __name__ == "__main__":
main()