"""Train an IbaXs-Llama causal language model on alpaca-style instruction
data, configured via Hydra and driven by the HuggingFace Trainer."""

import inspect
import os
import random

import hydra
import numpy as np
import torch
import transformers
from datasets import load_dataset
from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf
from transformers import (AutoConfig, AutoTokenizer, LlamaTokenizer,
                          TrainingArguments)

from iba import IbaXs_LlamaModel, IbaXs_LlamaForCausalLM, MainConfig

# Register the structured config schema so Hydra can validate the YAML config.
cs = ConfigStore.instance()
cs.store(name="main_schema", node=MainConfig)

# Fall back to CPU when no GPU is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed: int):
    """Seed every RNG in use (python, numpy, torch, transformers)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # transformers.set_seed re-seeds the libraries above as well; harmless.
    transformers.set_seed(seed)

def test_generate(config, main_cfg):
    """Smoke-test generation with a freshly initialised IbaXs model."""
    base_model_name = main_cfg.model.base_model_name
    if config.model_type == 'llama':
        # The substring "lama-3" matches both "Llama-3" and "llama-3" names.
        if "lama-3" in base_model_name:
            print("load llama-3 tokenizer")
            tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        else:
            tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

    model = IbaXs_LlamaForCausalLM(config=config).to(DEVICE)
    model.eval()
    prompts = [
        "The capital of France is",
    ]
    for i, prompt in enumerate(prompts):
        print(f"\n--- Prompt {i + 1} ---")
        print(f"Input: {prompt}")

        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=4,
                do_sample=True,
                temperature=0.7,
                top_k=50,
            )

        # Strip the prompt tokens so only the newly generated text is decoded.
        output_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)

        print(f"Output: {generated_text}")

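# Hypothetical usage of test_generate (it is defined but never called in this
# script); the AutoConfig line mirrors what main() does below:
#   config = AutoConfig.from_pretrained(main_cfg.model.base_model_name)
#   test_generate(config, main_cfg)
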
def trainIBA(config, main_cfg):
    training_cfg = main_cfg.training
    data_cfg = main_cfg.data

    # Keep only the keys that TrainingArguments actually accepts, so extra
    # fields in the training config (batch_size, cutoff_len, ...) do not
    # break its constructor.
    valid_hf_arg_names = set(inspect.signature(TrainingArguments).parameters.keys())
    training_config_dict = OmegaConf.to_container(
        training_cfg, resolve=True
    )
    filtered_training_args_dict = {
        key: value for key, value in training_config_dict.items()
        if key in valid_hf_arg_names
    }
    trainer_args = TrainingArguments(**filtered_training_args_dict)

    gradient_accumulation_steps = training_cfg.batch_size // training_cfg.per_device_train_batch_size

    # device_map is currently unused because the model is built from scratch
    # below rather than loaded with from_pretrained.
    device_map = "auto"
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
        gradient_accumulation_steps = gradient_accumulation_steps // world_size
    # Hand the (possibly DDP-adjusted) accumulation to the Trainer args.
    trainer_args.gradient_accumulation_steps = gradient_accumulation_steps

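    # Worked example of the accumulation arithmetic above (numbers are
    # illustrative, not from the shipped config): with batch_size=128 and
    # per_device_train_batch_size=4, gradient_accumulation_steps = 128 // 4
    # = 32; under 4-GPU DDP (WORLD_SIZE=4) each rank accumulates 32 // 4 = 8
    # steps, keeping the effective global batch at 4 * 8 * 4 = 128.
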
    base_model_name = main_cfg.model.base_model_name

    # Build the IbaXs model from scratch (random init) rather than loading
    # pretrained weights.
    model = IbaXs_LlamaForCausalLM(config=config).to(DEVICE)

    if config.model_type == 'llama':
        # The substring "lama-3" matches both "Llama-3" and "llama-3" names.
        if "lama-3" in base_model_name:
            print("load llama-3 tokenizer")
            tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        else:
            tokenizer = LlamaTokenizer.from_pretrained(base_model_name, legacy=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

    # Pad with token id 0 so padding is distinguishable from eos, and pad on
    # the left as is usual for decoder-only models.
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"

    def tokenize(prompt, max_length=training_cfg.cutoff_len, add_eos_token=True):
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=None,
        )
        # Append an eos token when truncation left room for one.
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < max_length
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            if "chatglm" not in base_model_name:
                result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        # chatglm's collator rejects attention_mask, so drop it for that model.
        if "chatglm" in base_model_name:
            return {"input_ids": result["input_ids"], "labels": result["labels"]}
        else:
            return result

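    # Illustrative example (token ids are made up): with max_length=8, a
    # prompt tokenizing to [1, 450, 7483] becomes input_ids=[1, 450, 7483, 2]
    # after the eos append above, while a prompt already truncated to 8
    # tokens gets no trailing eos.
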
    def generate_and_tokenize_prompt(data_point):
        full_prompt = generate_prompt(data_point)
        tokenized_full_prompt = tokenize(full_prompt)

        if not training_cfg.train_on_inputs:
            # Re-tokenize the prompt without the answer to find how many
            # leading tokens belong to the user prompt.
            user_prompt = generate_prompt({**data_point, "output": ""})
            tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            # Mask the prompt tokens with -100 so the loss is computed only
            # on the response tokens.
            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][
                user_prompt_len:
            ]
        return tokenized_full_prompt

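    # -100 is the ignore_index of torch.nn.CrossEntropyLoss, which HF models
    # use for their built-in loss, so masked positions contribute nothing to
    # the gradient; e.g. user_prompt_len=5 gives
    # labels = [-100]*5 + labels[5:].
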
    if data_cfg.data_path.endswith(".json"):
        data = load_dataset("json", data_files=data_cfg.data_path)
    else:
        data = load_dataset(data_cfg.data_path)

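    # Each record is expected to carry alpaca-style "instruction", "input",
    # and "output" fields, since generate_prompt() below reads exactly those
    # keys, e.g. {"instruction": "Translate to French", "input": "Hello",
    # "output": "Bonjour"}.
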
    # Resolve checkpoint resumption up front; the config flag may hold a
    # checkpoint directory path.
    resume_from_checkpoint = training_cfg.resume_from_checkpoint
    if resume_from_checkpoint:
        # Prefer a full checkpoint, falling back to an adapter-only file.
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "pytorch_model.bin"
        )
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            )
            # Only weights can be restored from an adapter file, so prevent
            # the Trainer from also trying to resume optimizer/trainer state.
            resume_from_checkpoint = False

        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            # Note the hard-coded load path; the located checkpoint_name is
            # used only for the existence check and the log message.
            model = IbaXs_LlamaModel.from_pretrained("./my-saved-model")
        else:
            print(f"Checkpoint {checkpoint_name} not found")

    if training_cfg.val_set_size > 0:
        train_val = data["train"].train_test_split(
            test_size=training_cfg.val_set_size, shuffle=True, seed=42
        )
        train_data = train_val["train"].map(generate_and_tokenize_prompt, num_proc=8)
        val_data = train_val["test"].map(generate_and_tokenize_prompt)
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt, num_proc=8)
        val_data = None

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=trainer_args,
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    # KV caching is a generation-time optimisation; turn it off for training.
    model.config.use_cache = False

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

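    # Sketch for persisting the trained weights, assuming
    # IbaXs_LlamaForCausalLM inherits save_pretrained from
    # transformers.PreTrainedModel (its from_pretrained use above suggests so):
    #   model.save_pretrained(trainer_args.output_dir)
    #   tokenizer.save_pretrained(trainer_args.output_dir)
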
def generate_prompt(data_point):
    """Render an alpaca-style prompt from an instruction/input/output record."""
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""

@hydra.main(config_path="../conf_hydra", config_name="config", version_base='1.3')
def main(main_cfg: MainConfig):
    # Merge the structured schema with the YAML config so missing fields get
    # their dataclass defaults and types are validated.
    main_cfg = OmegaConf.merge(OmegaConf.structured(MainConfig), main_cfg)
    main_cfg_dict = OmegaConf.to_container(main_cfg, resolve=True)

    config = AutoConfig.from_pretrained(
        main_cfg.model.base_model_name
    )
    # Shrink the architecture far below the pretrained sizes (a tiny model,
    # e.g. for fast iteration); the tokenizer and model_type still come from
    # the base checkpoint.
    config.hidden_size = 128
    config.intermediate_size = 290
    config.num_hidden_layers = 3
    config.head_dim = config.hidden_size // config.num_attention_heads

    # Attach the full run config to the model config for downstream access.
    config.main_cfg = main_cfg_dict
    set_seed(main_cfg.seed)
    trainIBA(config, main_cfg)


if __name__ == "__main__":
    main()
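
# Illustrative invocation (the real override keys are defined by
# ../conf_hydra/config.yaml, not shown here; the script name is a placeholder):
#   python train.py seed=42 model.base_model_name=meta-llama/Llama-2-7b-hf \
#       training.batch_size=128 data.data_path=./alpaca_data.json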