| import yaml |
| import draccus |
|
|
| from typing import List, Tuple |
| from dataclasses import field, dataclass, asdict |
|
|
| from .config import MainConfig, convert_to_trainer_args |
|
|
| import random |
| import numpy as np |
| import torch |
| import transformers |
|
|
| import wandb |
| from datasets import load_dataset |
| import os |
| import json |
| from datetime import datetime |
|
|
| import torch |
| import torch.optim as optim |
| from typing import Sequence, Literal, Dict |
| from torch.nn.utils.rnn import pad_sequence |
|
|
| from transformers import ( |
| AutoModelForCausalLM, AutoTokenizer, |
| Trainer, |
| set_seed, |
| get_linear_schedule_with_warmup, |
| get_cosine_schedule_with_warmup, |
| ) |
|
|
| import copy |
|
|
| from smpeft.sama import SamaConfig |
| from smpeft import get_peft_model, PeftModel |
| from .utils import trainable_parameters_to_file, set_seed_all |
| import warnings |
|
|
| |
# Known-noisy warnings that are silenced for the whole process.  Each entry
# holds the keyword arguments of one ``warnings.filterwarnings("ignore", ...)``
# call; the filters are registered in the order listed.
for _filter_kwargs in (
    {"category": FutureWarning, "module": 'torch._inductor.lowering'},
    {"message": ".*Online softmax is disabled on the fly.*", "category": UserWarning},
    {"message": ".*Our suggested max number of worker in current system is 1.*", "category": UserWarning},
    {"message": ".*will be initialized from a multivariate normal distribution.*"},
    {"message": ".*that differ from the model config and generation config.*", "category": UserWarning},
    {"message": ".*torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch..*", "category": UserWarning},
):
    warnings.filterwarnings("ignore", **_filter_kwargs)
del _filter_kwargs

# Label value used in this file for masked prompt tokens and for label padding.
IGNORE_INDEX = -100

# Prompt wrapped around every query; ``{instruction}`` is the only placeholder.
# NOTE(review): the wording contains typos ("an passage", "coresponding") and
# no separator between its first two sentences.  It is reproduced verbatim
# because the text is fed to the model, so editing it changes every prompt.
PROMPT_TEMPLATE = (
    "Below is an passage followed by a coresponding question "
    "that describes a task Write a response that appropriately "
    "completes the request with your answer.\n\n"
    "### Instruction:\n{instruction}\n\n"
    "### Response:"
)
|
|
| def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict: |
| """ |
| Tokenize a list of strings. |
| Modified to return lists (not tensors) for better compatibility with HF dataset.map(). |
| """ |
| tokenized_list = [ |
| tokenizer( |
| text, |
| return_tensors=None, |
| padding=False, |
| max_length=tokenizer.model_max_length, |
| truncation=True, |
| ) |
| for text in strings |
| ] |
| |
| input_ids = [tokenized['input_ids'] for tokenized in tokenized_list] |
| |
| |
| input_ids_lens = [len(x) for x in input_ids] |
| |
| return dict( |
| input_ids=input_ids, |
| labels=input_ids, |
| input_ids_lens=input_ids_lens, |
| ) |
|
|
def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt+answer pairs and hide the prompt part from the labels.

    Each source is concatenated with its target and tokenized; the sources are
    also tokenized alone to measure how many leading tokens belong to the
    prompt.  Those leading label positions are overwritten with
    ``IGNORE_INDEX``.

    Args:
        sources: Prompt texts.
        targets: Answer texts, paired with ``sources`` by position.
        tokenizer: Tokenizer handed through to ``_tokenize_fn``.

    Returns:
        A dict with ``input_ids`` (ids of the concatenated texts) and
        ``labels`` (an independent copy with the prompt prefix masked).
    """
    full_texts = [prompt + answer for prompt, answer in zip(sources, targets)]

    full_tokenized = _tokenize_fn(full_texts, tokenizer)
    prompt_tokenized = _tokenize_fn(sources, tokenizer)

    input_ids = full_tokenized["input_ids"]
    # Deep copy so that masking the labels never touches input_ids.
    labels = copy.deepcopy(input_ids)

    for row, prompt_len in enumerate(prompt_tokenized["input_ids_lens"]):
        label_row = labels[row]
        # The prompt alone can be as long as (or longer than) the truncated
        # full sequence, so never mask past the end of the row.
        n_masked = min(prompt_len, len(label_row))
        label_row[:n_masked] = [IGNORE_INDEX] * n_masked

    return {"input_ids": input_ids, "labels": labels}
|
|
def train_tokenize_function(examples, tokenizer):
    """Turn a batch of ``query`` / ``response`` columns into training features.

    Written for a MetaMathQA-style batch: ``examples['query']`` and
    ``examples['response']`` are read as parallel sequences.  Each query is
    placed into ``PROMPT_TEMPLATE``; each response gets ``tokenizer.eos_token``
    appended.  The resulting texts are passed to ``preprocess``.

    Args:
        examples: Mapping with ``'query'`` and ``'response'`` columns.
        tokenizer: Tokenizer providing ``eos_token``; forwarded to ``preprocess``.

    Returns:
        Whatever ``preprocess`` returns for the built prompts and targets.
    """
    pairs = list(zip(examples['query'], examples['response']))

    prompts = [PROMPT_TEMPLATE.format_map(dict(instruction=question)) for question, _ in pairs]
    answers = [f"{answer}{tokenizer.eos_token}" for _, answer in pairs]

    return preprocess(prompts, answers, tokenizer)
|
|
|
|
@dataclass
class DataCollatorForSupervisedDataset():
    """Collate tokenized examples into equally sized ``input_ids``, ``labels``
    and ``attention_mask`` tensors.

    Inputs are right-padded (ids with ``tokenizer.pad_token_id``, labels with
    ``IGNORE_INDEX``) and cut to the target width when they are longer.
    """
    # Tokenizer; only its ``pad_token_id`` is read here.
    tokenizer: transformers.PreTrainedTokenizer
    # Upper bound on the width (sequence dimension) of the produced batch.
    max_length: int = field(default=512)
    # "dynamic": width is the longest example in the batch, capped at
    # ``max_length``.  Any other value: width is always ``max_length``.
    mode: str = field(default="fixed")

    @staticmethod
    def _pad_and_truncate(tensors, padding_value, target_len):
        """Right-pad 1-D tensors to a common width, then crop or extend to ``target_len``."""
        padded = pad_sequence(tensors, batch_first=True, padding_value=padding_value)
        width = padded.shape[1]
        if width > target_len:
            return padded[:, :target_len]
        if width < target_len:
            filler = torch.full(
                (padded.shape[0], target_len - width), padding_value, dtype=padded.dtype
            )
            return torch.cat([padded, filler], dim=1)
        return padded

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build one batch.

        Args:
            instances: Examples, each with ``"input_ids"`` and ``"labels"``
                integer sequences.

        Returns:
            Dict with ``input_ids``, ``labels`` and ``attention_mask``, all of
            shape ``(len(instances), target_len)`` and dtype ``torch.long``.

        Raises:
            ValueError: If the tokenizer has no ``pad_token_id``.
        """
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError("Tokenizer.pad_token_id is None. Please set it to eos_token_id or unk_token_id.")

        input_ids_list = [torch.tensor(x["input_ids"], dtype=torch.long) for x in instances]
        labels_list = [torch.tensor(x["labels"], dtype=torch.long) for x in instances]

        if self.mode == "dynamic":
            longest = max(len(ids) for ids in input_ids_list)
            target_len = min(longest, self.max_length)
        else:
            target_len = self.max_length

        input_ids = self._pad_and_truncate(input_ids_list, pad_id, target_len)
        labels = self._pad_and_truncate(labels_list, IGNORE_INDEX, target_len)

        # Derive the mask from the true sequence lengths rather than from
        # ``input_ids != pad_id``: the pad id may be shared with a real token
        # (e.g. UNK or EOS), and those genuine tokens must stay attended.
        kept_lengths = torch.tensor(
            [min(len(ids), target_len) for ids in input_ids_list], dtype=torch.long
        )
        positions = torch.arange(target_len, dtype=torch.long)
        attention_mask = (positions.unsqueeze(0) < kept_lengths.unsqueeze(1)).long()

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }
| |
@draccus.wrap()
def main(mainCfg: MainConfig):
    """Fine-tune a causal language model on a query/response dataset.

    Steps, in order: seed, build the trainer arguments from ``mainCfg``, load
    the base model, attach adapters (load from ``mainCfg.model.adapter_path``,
    or create one SAMA adapter per name in ``mainCfg.data.adapter_names`` when
    ``mainCfg.sama_adapter.col_L`` is set; otherwise train the full model),
    tokenize the train and eval data, train with an explicitly built AdamW
    optimizer, and save tokenizer, trainer state, adapter config and model
    under the run's output directory.

    Args:
        mainCfg: Run configuration, supplied through the ``draccus.wrap``
            decorator.

    Raises:
        ValueError: If the tokenizer has no pad token and neither an unk nor
            an eos token to fall back on.
    """
    print('='*120)
    set_seed_all(mainCfg.seed)

    training_args = convert_to_trainer_args(mainCfg)
    task_name = mainCfg.data.dataset_name

    # Number this run after the runs already in the W&B project.  This is
    # purely cosmetic, so any failure (missing project, no credentials, no
    # network) falls back to 1 instead of aborting the training job.
    entity = os.environ.get("WANDB_ENTITY") or "nvan-13-korea-university"
    project = os.environ.get("WANDB_PROJECT")
    next_run_num = 1
    if project:
        try:
            next_run_num = len(wandb.Api().runs(f"{entity}/{project}")) + 1
        except Exception as err:
            print(f"Could not query W&B runs for {entity}/{project} ({err}); numbering this run as 1.")

    model = AutoModelForCausalLM.from_pretrained(
        mainCfg.model.model_name,
        device_map="auto",
        low_cpu_mem_usage=True,
        dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    )
    total_params_now = sum(p.numel() for p in model.parameters())
    print(f'#params of the pretrained model, {total_params_now:,}')

    if mainCfg.model.adapter_path is not None:
        print('___ Loading from: ', mainCfg.model.adapter_path)
        model = PeftModel.from_pretrained(model, mainCfg.model.adapter_path, is_trainable=True)
    elif mainCfg.sama_adapter.col_L is not None:
        sama_adapter_config = asdict(mainCfg.sama_adapter)
        # NOTE(review): each iteration wraps the previous result of
        # get_peft_model again; confirm that is intended when several adapter
        # names are configured.
        for adapter_name in mainCfg.data.adapter_names:
            print("Init from Sama Config:", json.dumps(sama_adapter_config, indent=4, sort_keys=True))
            sama_config = SamaConfig(**sama_adapter_config)
            model = get_peft_model(model, sama_config, adapter_name=adapter_name)
    else:
        print("Full Parameter Fine-Tuning")

    # A plain (non-adapter) model does not offer this helper.
    if hasattr(model, "print_trainable_parameters"):
        model.print_trainable_parameters()

    # Materialised as a list so it can be inspected or reused safely.
    trainable_params = [p for p in model.parameters() if p.requires_grad]

    tokenizer = AutoTokenizer.from_pretrained(
        mainCfg.model.model_name,
        model_max_length=mainCfg.model.model_max_seq_length,
        padding_side="right",
        use_fast=True,
    )

    if tokenizer.pad_token is None:
        if tokenizer.unk_token_id is not None:
            tokenizer.pad_token_id = tokenizer.unk_token_id
            tokenizer.pad_token = tokenizer.unk_token
            print("Set PAD token to UNK token.")
        elif tokenizer.eos_token_id is not None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
            tokenizer.pad_token = tokenizer.eos_token
            print("Set PAD token to EOS token.")
        else:
            # Fail before the expensive tokenization instead of at the first batch.
            raise ValueError("Tokenizer has no pad, unk or eos token; cannot pad batches.")

    model.config.pad_token_id = tokenizer.pad_token_id

    metamathqa_train = load_dataset(path=mainCfg.data.path, split=mainCfg.data.dataset_split)
    # NOTE(review): the eval rows are a fixed slice of the 'train' split and
    # may overlap the training rows, depending on mainCfg.data.dataset_split.
    metamathqa_valid = load_dataset(path=mainCfg.data.path, split='train[20000:20256]')

    train_dataset = metamathqa_train.map(
        train_tokenize_function,
        batched=True,
        batch_size=20000,
        num_proc=32,
        remove_columns=metamathqa_train.column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
        fn_kwargs={"tokenizer": tokenizer},
    )
    dev_dataset = metamathqa_valid.map(
        train_tokenize_function,
        batched=True,
        batch_size=20000,
        num_proc=32,
        load_from_cache_file=True,
        remove_columns=metamathqa_valid.column_names,
        fn_kwargs={"tokenizer": tokenizer},
    )
    print('- Train dataset size: ', len(train_dataset))
    print('- Dev dataset size: ', len(dev_dataset))

    data_collator = DataCollatorForSupervisedDataset(
        tokenizer=tokenizer,
        max_length=mainCfg.model.model_max_seq_length,
    )
    data_module = dict(train_dataset=train_dataset, data_collator=data_collator, eval_dataset=dev_dataset)

    # NOTE(review): weight decay is left at torch's AdamW default here;
    # training_args.weight_decay is not consulted - confirm this is intended.
    optimizer = optim.AdamW(
        trainable_params,
        lr=mainCfg.trainer_args.learning_rate,
        eps=1e-8,
    )

    start_time = datetime.now()
    # The slice drops the first digit of the two-digit year.
    date_str = start_time.strftime("%y%m%dd%Hh%Mm%S")[1:]

    # One hyper-parameter tag shared by the output directory and the run name
    # so the two cannot drift apart.
    hparam_tag = (
        f'mlr{training_args.learning_rate:.1e},'
        f'b{mainCfg.trainer_args.per_device_train_batch_size},{mainCfg.trainer_args.gradient_accumulation_steps},'
        f'nb{mainCfg.sama_adapter.num_unique_blocks_L},{mainCfg.sama_adapter.num_unique_blocks_R},'
        f'cL{mainCfg.sama_adapter.col_L},'
        f'rR{mainCfg.sama_adapter.row_R},s{mainCfg.sama_adapter.scaling},'
        f'init{mainCfg.run_text},dr{mainCfg.sama_adapter.drop_out},'
        f'ep{training_args.num_train_epochs}'
    )
    output_dir = f'{training_args.output_dir}/{task_name}/t{date_str},{hparam_tag}'
    print('out', type(output_dir), output_dir)

    trainable_parameters_to_file(model, output_dir)

    training_args.output_dir = output_dir
    print(f'Current output_dir: {output_dir}')
    training_args.run_name = f'[{next_run_num}-{task_name}]{hparam_tag},t{date_str}'
    print('out', type(training_args.run_name), training_args.run_name)
    print(f'data: {task_name}, train: {len(train_dataset)}, valid: {len(dev_dataset)}')

    # NOTE(review): this callback is constructed but never handed to the
    # Trainer, and its file/run names are hard-coded; register it via
    # ``callbacks=[monitor]`` or remove it once the intent is confirmed.
    from .utils import ExperimentMonitorCallback
    monitor = ExperimentMonitorCallback(
        log_file_path="./training_metrics_bs8.json",
        run_name="Experiment_BatchSize_8",
        log_interval=20
    )

    # The scheduler slot is left as None so the Trainer builds it from
    # training_args (scheduler type, warmup settings) with its own count of
    # optimisation steps.  A hand-computed count based on training_args.n_gpu
    # can be wrong when the model is loaded with device_map="auto", and it
    # ignores max_steps and warmup_ratio.
    trainer = Trainer(
        model=model,
        args=training_args,
        processing_class=tokenizer,
        optimizers=(optimizer, None),
        **data_module,
    )

    model.config.use_cache = False
    trainer.train()

    end_time = datetime.now()
    print('end time: ', end_time.strftime("%Y-%m-%d %H:%M:%S"), '| duration: ', end_time - start_time)

    ft_dir = os.path.join(training_args.output_dir, 'ft')
    tokenizer.save_pretrained(ft_dir)
    trainer.save_state()
    # Save the weights first: they are the expensive artefact and must not be
    # lost if writing the adapter config fails.
    model.save_pretrained(os.path.join(training_args.output_dir, 'ft2'))

    # Only adapter-wrapped models carry ``peft_config``; it is either a single
    # config object or a mapping of adapter name to config.  With several
    # adapters each config is written to the same directory.
    peft_config = getattr(model, "peft_config", None)
    if peft_config is not None:
        configs = peft_config.values() if isinstance(peft_config, dict) else (peft_config,)
        for adapter_config in configs:
            adapter_config.save_pretrained(ft_dir)


if __name__ == "__main__":
    main()