# ChatTime/training/pretrain.py
# Scraped Hugging Face file-page header (not part of the script): uploaded by a12354,
# commit "Add files using upload-large-folder tool" (7d22995, verified), file size 6.48 kB.
import argparse
import sys
import numpy as np
import torch
from datasets import load_dataset
from transformers import TrainingArguments, LlamaTokenizer
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
def _build_arg_parser():
    """Build the command-line parser for the pre-training script.

    Returns:
        argparse.ArgumentParser: parser exposing the path, model/LoRA,
        optimisation-schedule and time-series vocabulary options read by the
        script body below.
    """
    parser = argparse.ArgumentParser()

    # Required locations (``required=True`` already implies there is no default).
    parser.add_argument("--code_path", type=str, required=True)
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--dataset_path", type=str, required=True)
    parser.add_argument("--log_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, required=True)

    # Model loading and LoRA adapter settings.
    parser.add_argument("--max_seq_length", type=int, default=2048)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)
    parser.add_argument("--lora_rank", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--lora_dropout", type=float, default=0.00)
    parser.add_argument("--random_seed", type=int, default=3407)

    # Training schedule.
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument("--per_device_train_batch_size", type=int, default=64)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
    parser.add_argument("--save_steps", type=int, default=2)
    parser.add_argument("--logging_steps", type=int, default=2)
    parser.add_argument("--max_steps", type=int, default=-1)

    # Options forwarded to the project's Discretizer / Serializer.
    parser.add_argument("--low_limit", type=float, default=-1)
    parser.add_argument("--high_limit", type=float, default=1)
    parser.add_argument("--n_tokens", type=int, default=10002)
    parser.add_argument("--prec", type=int, default=4)
    parser.add_argument("--time_sep", type=str, default=" ")
    parser.add_argument("--time_flag", type=str, default="###")
    parser.add_argument("--nan_flag", type=str, default="Nan")

    # Number of worker processes handed to SFTTrainer for dataset preparation.
    # Previously hard-coded to 64; the default keeps that behaviour.
    parser.add_argument("--dataset_num_proc", type=int, default=64)
    return parser


def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB (1024**3 bytes), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


if __name__ == "__main__":
    args = _build_arg_parser().parse_args()

    # utils.tools is resolved relative to --code_path, so it can only be
    # imported once that directory has been appended to sys.path.
    sys.path.append(args.code_path)
    from utils.tools import Discretizer, Serializer

    # construct vocabulary
    discretizer = Discretizer(low_limit=args.low_limit, high_limit=args.high_limit, n_tokens=args.n_tokens)
    serializer = Serializer(prec=args.prec, time_sep=args.time_sep, time_flag=args.time_flag, nan_flag=args.nan_flag)
    # All bin centers except the first and last, plus one NaN entry.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    vocabulary = np.concatenate((discretizer.centers[1:-1], [np.nan])).reshape(-1, 1)
    vocabulary = np.array([serializer.serialize(i) for i in vocabulary])
    print(f"\nVocabulary: \n{vocabulary}\n")

    # add token to llama tokenizer
    tokenizer = LlamaTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print(f"Old model pieces: {len(tokenizer.get_vocab())}")
    tokenizer.add_tokens(vocabulary.tolist())
    print(f"New model pieces: {len(tokenizer.get_vocab())}")
    EOS_TOKEN = tokenizer.eos_token

    # load model
    # The second return value is discarded: the extended tokenizer built above
    # is the one used for training and saving. Its size is passed as
    # resize_model_vocab so the model matches the enlarged vocabulary.
    model, _ = FastLanguageModel.from_pretrained(
        model_name=args.model_path,
        max_seq_length=args.max_seq_length,
        dtype=None,
        load_in_4bit=args.load_in_4bit,
        resize_model_vocab=len(tokenizer.get_vocab()),
    )

    # add lora to llama model
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_rank,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        # embed_tokens and lm_head are listed in modules_to_save in addition to
        # the LoRA target modules.
        modules_to_save=["embed_tokens", "lm_head"],
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=args.random_seed,
        max_seq_length=args.max_seq_length,
    )

    # load dataset
    def formatting_func(example):
        """Return the example's "text" field with the tokenizer EOS token appended."""
        return example["text"] + EOS_TOKEN

    print(f"\nLoading dataset in {args.dataset_path}")
    dataset = load_dataset(args.dataset_path, split="train")
    print(f"Dataset example: \n{dataset[0]['text']}\n")

    # train model
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    use_bf16 = is_bfloat16_supported()
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=args.max_seq_length,
        dataset_num_proc=args.dataset_num_proc,
        packing=False,
        formatting_func=formatting_func,
        args=TrainingArguments(
            per_device_train_batch_size=args.per_device_train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            num_train_epochs=args.num_train_epochs,
            weight_decay=0.01,
            warmup_ratio=0.05,
            max_grad_norm=1.0,
            learning_rate=2e-4,
            logging_strategy="steps",
            logging_steps=args.logging_steps,
            save_strategy="steps",
            save_steps=args.save_steps,
            max_steps=args.max_steps,
            save_total_limit=1,
            logging_first_step=True,
            optim="adamw_8bit",
            lr_scheduler_type="cosine",
            seed=args.random_seed,
            output_dir=args.log_path,
            fp16=not use_bf16,
            bf16=use_bf16,
        ),
    )

    # show current memory stats (CUDA device 0)
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
    max_memory = _bytes_to_gib(gpu_stats.total_memory)
    print(f"\nGPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.\n")

    trainer_stats = trainer.train()

    # show final memory and time stats
    used_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
    # Peak reserved memory minus what was already reserved before training.
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"\n{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.\n")

    # save model and tokenizer
    model.save_pretrained_merged(args.output_path, tokenizer)