| # 处理方法 | |
| - 把训练集SolRet_training的query和value做对比学习 | |
| # 参考代码 | |
| ```python | |
| ########################################################################### imports | |
| import os | |
| import json | |
| # os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' | |
| import argparse | |
| import random | |
| import math | |
| from time import time | |
| import numpy as np | |
| import torch | |
| from torch.utils.data import DataLoader | |
| from torch.optim import AdamW | |
| from torch.optim.lr_scheduler import LambdaLR | |
| from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer | |
| from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as Qwen2ForCausalLMOrig | |
| from transformers.models.qwen2.modeling_qwen2 import QWEN2_INPUTS_DOCSTRING,_CONFIG_FOR_DOC | |
| from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings | |
| from typing import List, Optional, Tuple, Union | |
| from torch import nn | |
| from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss | |
| from torch.nn import functional as F | |
| from transformers.modeling_outputs import CausalLMOutputWithPast | |
| from datasets import load_dataset | |
| import deepspeed | |
| from peft import LoraConfig, get_peft_model, PeftModel | |
| from functools import partial | |
| import pynvml | |
class print_time:
    """Context manager that prints a description on entry and the elapsed
    wall-clock seconds on exit."""

    def __init__(self, *desc):
        # Arbitrary positional arguments are forwarded to print() on entry.
        self.desc = desc

    def __enter__(self):
        print(*self.desc)
        self.t = time()

    def __exit__(self, exc_type, exc_value, traceback):
        # Report elapsed time regardless of whether the body raised.
        print(f'{time() - self.t:.02f}s')
# Pin this process to a specific GPU.
def set_gpus(gpu):
    """Select the active CUDA device (index or torch.device) for this process."""
    torch.cuda.set_device(gpu)
def set_seed(seed):
    """Seed every RNG used during training (string hashing, `random`,
    NumPy, torch CPU and — when available — all CUDA devices) so that
    runs with the same seed are reproducible."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        # Seed the current CUDA device and every other visible device.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
def set_cuda(deterministic=True):
    """Configure cuDNN convolution behavior.

    deterministic=True forces deterministic convolution algorithms
    (reproducible results).  deterministic=False enables benchmark mode,
    which spends time up-front auto-tuning the fastest algorithm per
    layer — a win when the network structure and input shapes (batch
    size, image size, channels) are fixed, but a loss when they keep
    changing, since the tuner re-runs on every new shape.
    """
    if not torch.cuda.is_available():
        return
    torch.backends.cudnn.deterministic = deterministic
    # Benchmark mode and determinism are mutually exclusive here.
    torch.backends.cudnn.benchmark = not deterministic
class Qwen2ForCausalLM(Qwen2ForCausalLMOrig):
    """Qwen2 backbone adapted for contrastive retrieval training.

    A bias-free linear head (`retrieve_proj`) maps the final hidden state of
    the last token position to a 256-d retrieval embedding.  `forward`
    replaces the usual language-modeling loss with a symmetric in-batch
    contrastive loss between the first half of the batch (queries) and the
    second half (values); see `tokenize_batch` for the batch layout.
    """

    def __init__(self, config):
        super().__init__(config)
        # Output dimensionality of the retrieval embedding.
        retrieve_size=256
        self.retrieve_proj = nn.Linear(config.hidden_size, retrieve_size, bias=False)

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Compute a symmetric in-batch contrastive loss over retrieval embeddings.

        The batch is assumed to be laid out as [queries ; values] with equal
        halves.  `labels` is accepted only for interface compatibility with
        the parent class and is ignored; the returned `loss` is the
        contrastive loss and `logits` is None.

        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        #################################################################################################################
        loss_fct = CrossEntropyLoss()
        # NOTE(review): the backbone forward runs under no_grad, so gradients
        # flow only through retrieve_proj (frozen-backbone training) —
        # confirm this is intentional.
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                cache_position=cache_position,
            )
        hidden_states = outputs[0]
        # L2-normalized embedding of the last position of every sequence.
        # Assumes left padding so index -1 is a real token — see
        # initialize_model_tokenizer, which sets tokenizer.padding_side='left'.
        input_feats = F.normalize(
            self.retrieve_proj(hidden_states[:, -1, :]), dim=-1
        )
        # Split the batch into its two halves: queries first, then values.
        half_size = input_feats.shape[0] // 2
        # First half: query embeddings.
        query_feats = input_feats[:half_size, :]
        # Second half: value embeddings.
        value_feats = input_feats[half_size:, :]
        # Similarity matrices (cosine, since features are unit-norm):
        # query -> value and value -> query.
        sim_t2q = torch.matmul(
            query_feats, value_feats.permute(1,0)
        )
        sim_q2t = torch.matmul(
            value_feats, query_feats.permute(1,0)
        )
        # n*n similarity; the target for row i is column i, i.e. each query's
        # positive is the value at the same batch position.
        targets = torch.linspace(0, query_feats.size(0) - 1, query_feats.size(0), dtype=int).to(
            query_feats.device
        )
        # Symmetric cross-entropy (InfoNCE-style) over both directions.
        loss_itc = loss_fct(sim_t2q, targets) + loss_fct(sim_q2t, targets)
        # print(f'loss_itc\n{loss_itc}\n\nsim_t2q\n{sim_t2q}\n\nsim_q2t\n{sim_q2t}')
        return CausalLMOutputWithPast(
            loss=loss_itc,
            logits=None,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Tokenize a batch before it is fed to the model.
def tokenize_batch(batch,tokenizer):
    """Tokenize a contrastive batch and append an EOS token to every row.

    The tokenized batch is laid out as [queries ; values]: the first half of
    the rows come from batch['query'], the second half from batch['value'].
    One EOS token (and a matching attention-mask column of ones) is appended
    to the right end of every sequence.
    """
    texts = list(batch['query']) + list(batch['value'])
    encoded = tokenizer(texts, return_tensors='pt', padding='longest', return_token_type_ids=False)
    ids, mask = encoded['input_ids'], encoded['attention_mask']
    n_rows = ids.shape[0]
    # Column of EOS ids, one per row, matching the input dtype.
    eos_col = torch.full((n_rows, 1), tokenizer.eos_token_id, dtype=ids.dtype)
    encoded['input_ids'] = torch.cat([ids, eos_col], dim=-1)
    # Mark the appended EOS tokens as valid in the attention mask.
    ones_col = torch.ones((n_rows, 1), dtype=mask.dtype)
    encoded['attention_mask'] = torch.cat([mask, ones_col], dim=-1)
    return encoded
# Save model weights, tokenizer, config, and training statistics.
def save_model_tokenizer_config_params(args,model_engine,tokenizer,training_params_save,epoch,step=None):
    """Checkpoint everything under save_dir/epoch_<epoch>[/step_<step>].

    Writes the fp16 model weights, the tokenizer files, the model config,
    and the accumulated training metrics (as JSON) into that directory.
    """
    parts = [args.save_dir, f'epoch_{epoch}']
    if step is not None:
        parts.append(f'step_{step}')
    dir_name = os.path.join(*parts)
    model_engine.save_16bit_model(dir_name)
    tokenizer.save_pretrained(dir_name)
    model_engine.model.config.save_pretrained(dir_name)
    # Persist the training-metrics history alongside the checkpoint.
    with open(os.path.join(dir_name,'training_params_save.json'), 'w') as f:
        json.dump(training_params_save, f, indent=4)
# Load the data and later trim it to an exactly-divisible size (see __main__).
def load_training_dataset(args, file_folder_path):
    """Load the training split from parquet file(s) and shuffle it.

    Args:
        args: namespace providing `seed` for the deterministic shuffle.
        file_folder_path: a single parquet file, or a directory searched
            recursively for files whose names end in 'parquet'.

    Returns:
        A shuffled `datasets.Dataset` (the 'train' split).

    Raises:
        FileNotFoundError: if the path is neither a file nor a directory.
    """
    print('loading dataset \n')
    if os.path.isdir(file_folder_path):
        # Directory given: recursively collect every parquet file under it.
        parquet_files = [
            os.path.join(dirpath, filename)
            for dirpath, _dirnames, filenames in os.walk(file_folder_path)
            for filename in filenames
            if filename.endswith('parquet')
        ]
        ds = load_dataset("parquet", data_files=parquet_files)['train']
    elif os.path.isfile(file_folder_path):
        # Single parquet file given.
        ds = load_dataset("parquet", data_files=file_folder_path)['train']
    else:
        # Previously this fell through and crashed later with a confusing
        # NameError on `ds`; fail fast with a clear error instead.
        raise FileNotFoundError(f'no such file or directory: {file_folder_path}')
    # Deterministic shuffle so runs with the same seed see the same order.
    ds = ds.shuffle(seed=args.seed)
    return ds
# Initialize the model and tokenizer.
def initialize_model_tokenizer(args):
    """Load the tokenizer and the custom Qwen2ForCausalLM from `args.model`.

    Optionally enables gradient checkpointing (args.gradient_checkpointing)
    and wraps the model in LoRA adapters.

    NOTE(review): reads the module-level global `LoRA` set in __main__;
    consider passing it explicitly.
    """
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Left padding keeps the last position of every row a real token, which
    # is what forward() pools via hidden_states[:, -1, :].
    tokenizer.padding_side='left'
    tokenizer.add_bos_token=False
    tokenizer.add_eos_token=False
    print('initializing model \n')
    config = AutoConfig.from_pretrained(args.model)
    if args.gradient_checkpointing:
        config.gradient_checkpointing = True
        # KV caching is incompatible with gradient checkpointing.
        config.use_cache = False
    model = Qwen2ForCausalLM.from_pretrained(args.model, config=config)
    # if hasattr(model, "enable_input_require_grads"):
    #     model.enable_input_require_grads()
    # else:
    #     def make_inputs_require_grad(module, input, output):
    #         output.requires_grad_(True)
    #     model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
    model.train()
    def find_all_linear_names(model):
        # Collect the leaf names of every nn.Linear module to use as LoRA targets.
        cls = torch.nn.Linear
        lora_module_names = set()
        for name, module in model.named_modules():
            if isinstance(module, cls):
                names = name.split('.')
                lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names: # needed for 16-bit
            lora_module_names.remove('lm_head')
        return list(lora_module_names)
    ############################################### LoRA
    if LoRA:
        loraconfig = LoraConfig(
            r=128,
            lora_alpha=256,
            target_modules=find_all_linear_names(model),
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, loraconfig)
        # model=PeftModel.from_pretrained(model, args.save_dir)
    # TODO(enijkamp): we need to set this flag twice?
    # Forget all activations during the forward pass and recompute them
    # during the backward pass (trades compute for memory).
    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()
    return model,tokenizer
def deepspeed_train(args,ds,model,tokenizer):
    """Run the DeepSpeed training loop for the contrastive retrieval model.

    NOTE(review): relies on module-level globals set in __main__: lr,
    num_epochs, steps, deterministic, steps_per_print, steps_per_save,
    save_flag, num_gpus, and the pynvml `handle` — consider threading them
    through `args`.
    """
    set_seed(args.seed)
    set_cuda(deterministic=deterministic)
    print('initializing deepspeed \n')
    # Only trainable parameters go to the optimizer (matters under LoRA).
    model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer=AdamW(model_parameters, lr=lr, betas=(0.9, 0.999),eps=1e-8,weight_decay=0.05)
    # Cosine decay with linear warmup; returns a multiplier for the base lr.
    def _get_cosine_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
    lr_lambda = partial(
        _get_cosine_schedule_with_warmup_lr_lambda,
        num_warmup_steps=100,
        num_training_steps=num_epochs*steps,
        num_cycles=0.5,
    )
    scheduler=LambdaLR(optimizer, lr_lambda)
    model_engine, optimizer, dataloader, lr_scheduler = deepspeed.initialize(config=args.deepspeed_config, model=model, model_parameters=model_parameters,training_data=ds,optimizer=optimizer, lr_scheduler=scheduler,)
    torch.cuda.empty_cache()
    print('starting training \n')
    # Per-step metrics accumulated and dumped to JSON at every checkpoint.
    training_params_save={
        'epoch':[],
        'step':[],
        'loss':[],
        'lr':[],
        'cost_time':[],
        'total_cost_time':[],
        'percent':[],
    }
    total_start_time=time()
    for epoch in range(1,num_epochs+1):
        for step, batch in enumerate(dataloader, start=1):
            # Tokenize on the fly; queries occupy the first half of the batch.
            batch=tokenize_batch(batch,tokenizer)
            start_time=time()
            batch.to(model_engine.device)
            loss = model_engine(input_ids=batch['input_ids'],attention_mask=batch['attention_mask']).loss
            model_engine.backward(loss)
            model_engine.step()
            if step % steps_per_print == 0:
                # Log progress, timing, and GPU state.
                percent=step*args.deepspeed_config['train_micro_batch_size_per_gpu']*num_gpus/len(ds)
                cost_time=time()-start_time
                total_cost_time=time()-total_start_time
                print(f'epoch {epoch} step {step} {percent:.2%} loss:{loss:8.3f} time:{cost_time:.2f} total time:{total_cost_time:.2f}\n')
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                print(f'GPU State --- Free:{meminfo.free/1024**2} Used:{meminfo.used/1024**2} Total:{meminfo.total/1024**2}\n') # memory figures in MiB
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                print(f'GPU Util --- {utilization.gpu}%\n') # GPU utilization percentage
                # Record training statistics for the checkpoint JSON.
                training_params_save['epoch'].append(epoch)
                training_params_save['step'].append(step)
                training_params_save['loss'].append(float(loss))
                training_params_save['lr'].append(optimizer.param_groups[0]['lr'])
                training_params_save['cost_time'].append(round(cost_time,4))
                training_params_save['total_cost_time'].append(round(total_cost_time,4))
                training_params_save['percent'].append(round(percent,4))
            # Checkpoint every steps_per_save steps.
            if save_flag and step%steps_per_save==0:
                save_model_tokenizer_config_params(args,model_engine,tokenizer,training_params_save,epoch,step)
        # Checkpoint once at the end of every epoch.
        if save_flag:
            save_model_tokenizer_config_params(args,model_engine,tokenizer,training_params_save,epoch)
if __name__ == '__main__':
    # ---- experiment configuration (hard-coded paths and hyper-parameters) ----
    args=argparse.Namespace()
    args.seed = 0
    model_name='Qwen2p5Coder_1p5B'
    args.model = f'/home/chenzy/models/{model_name}'
    args.training_dataset = '/home/chenzy/sysu_datasets/SolRet_training'
    args.save_dir=f'/home/chenzy/models/save/{model_name}_SolRet'
    ds=load_training_dataset(args, args.training_dataset)
    lr=1e-4
    train_micro_batch_size_per_gpu=128
    gradient_accumulation_steps=1
    num_gpus=4
    dataset_len=len(ds)
    # Optimizer steps per epoch assuming full global batches.
    steps=dataset_len//(num_gpus*train_micro_batch_size_per_gpu)
    # steps=5
    # Trim the dataset so it divides evenly into global batches.
    select_dataset_num=steps*num_gpus*train_micro_batch_size_per_gpu
    ds=ds.select(range(select_dataset_num))
    num_epochs=3
    steps_per_print=25
    steps_per_save=steps//4
    args.gradient_checkpointing=True
    save_flag=True
    deterministic=True
    LoRA=False
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0) # index of the GPU to monitor
    model,tokenizer=initialize_model_tokenizer(args)
    DEEPSPEED_CONFIG = \
    {
        # 'fp16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 12, 'hysteresis': 2, 'min_loss_scale': 1},
        # 'bf16': {'enabled':True},
        'fp16': {'enabled':True},
        # 'optimizer': {'type': 'AdamW', 'params': {'lr': 1e-05, 'betas': [0.9, 0.95], 'eps': 1e-08, 'weight_decay': 0.0}},
        # 'scheduler': {'type': 'WarmupLR', 'params': {'warmup_min_lr': 0, 'warmup_max_lr': 1e-05, 'warmup_num_steps': 100}},
        # ZeRO stage 1: optimizer states
        # ZeRO stage 2: gradients
        # ZeRO stage 3: model parameters
        # 'zero_optimization': {
        #     'stage': 2,
        #     # pin_memory improves throughput
        #     # device: cpu or none
        #     # 'offload_optimizer': {'device': 'cpu', 'pin_memory': True},
        #     # 'offload _param': {'device': 'cpu', 'pin_memory': True},
        #     # trade off GPU RAM and latency
        #     'overlap_comm': True,
        #     # reduce memory fragmentation
        #     'contiguous_gradients': True,
        #     # default =1e9, when not using NVMe
        #     # 'sub_group_size': 1e9,
        #     # 'reduce_bucket_size': 16777216,
        #     # 'stage3_prefetch_bucket_size': 15099494.4,
        #     # 'stage3_param_persistence_threshold': 40960,
        #     # 'gather_16bit_weights_on_model_save': True,
        # },
        'zero_optimization': {'stage': 0},
        # This is the amount of data samples that leads to one step of model update.
        # train_batch_size must equal train_micro_batch_size_per_gpu * gradient_accumulation * number of GPUs.
        # 'train_batch_size': train_batch_size,
        # Batch size to be processed by one GPU in one step (without gradient accumulation).
        'train_micro_batch_size_per_gpu': train_micro_batch_size_per_gpu,
        # Number of training steps to accumulate gradients before averaging and applying them.
        'gradient_accumulation_steps': gradient_accumulation_steps,
        'gradient_clipping': 1.0,
        # Print a progress report every N training steps (steps and skipped optimizer updates).
        'steps_per_print': steps_per_print*5,
        'wall_clock_breakdown': False,
        'compression_training': {'weight_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {}, 'different_groups': {}}}
    }
    args.deepspeed_config = DEEPSPEED_CONFIG
    deepspeed_train(args=args,ds=ds,model=model,tokenizer=tokenizer)
| ``` |