# Processing Method

- Train a contrastive objective over the SolRet_training set: each `query` is pulled toward its paired `value`, with the other pairs in the batch acting as in-batch negatives (a usage sketch for the trained retrieval head follows the reference code).

# Reference Code

```python
########################################################################### imports
import os
import json
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import argparse
import random
import math
from time import time
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as Qwen2ForCausalLMOrig
from transformers.models.qwen2.modeling_qwen2 import QWEN2_INPUTS_DOCSTRING, _CONFIG_FOR_DOC
from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
from typing import List, Optional, Tuple, Union
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from transformers.modeling_outputs import CausalLMOutputWithPast
from datasets import load_dataset
import deepspeed
from peft import LoraConfig, get_peft_model, PeftModel
from functools import partial
import pynvml


class print_time:
    """Context manager that prints a description and the elapsed wall-clock time."""
    def __init__(self, *desc):
        self.desc = desc

    def __enter__(self):
        print(*self.desc)
        self.t = time()

    def __exit__(self, type, value, traceback):
        print(f'{time()-self.t:.02f}s')


# pin the process to a specific GPU
def set_gpus(gpu):
    torch.cuda.set_device(gpu)


def set_seed(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


def set_cuda(deterministic=True):
    if torch.cuda.is_available():
        # fix the cuDNN convolution algorithm for reproducibility
        torch.backends.cudnn.deterministic = deterministic
        # Setting torch.backends.cudnn.benchmark=True makes the program spend a little extra time at
        # startup searching for the fastest convolution algorithm for every conv layer, which speeds
        # up the network afterwards. This is suitable when the network structure is fixed and the
        # input shapes (batch size, image size, channels) do not change, i.e. most common cases. If
        # the convolution configuration keeps changing, the program keeps re-tuning and ends up slower.
        torch.backends.cudnn.benchmark = not deterministic


class Qwen2ForCausalLM(Qwen2ForCausalLMOrig):
    def __init__(self, config):
        super().__init__(config)
        retrieve_size = 256
        # extra projection head mapping the last hidden state into the retrieval embedding space
        self.retrieve_proj = nn.Linear(config.hidden_size, retrieve_size, bias=False)

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
                ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen2ForCausalLM

        >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        #################################################################################################################
        loss_fct = CrossEntropyLoss()

        # query_feats
        # value_feats
        # the backbone runs under no_grad, so only the retrieve_proj head below receives gradients
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                cache_position=cache_position,
            )

        hidden_states = outputs[0]
        # project the last-token hidden state and L2-normalize it
        input_feats = F.normalize(
            self.retrieve_proj(hidden_states[:, -1, :]), dim=-1
        )

        # half of the batch dimension
        half_size = input_feats.shape[0] // 2
        # first half of the batch: queries
        query_feats = input_feats[:half_size, :]
        # second half of the batch: values
        value_feats = input_feats[half_size:, :]

        sim_t2q = torch.matmul(
            query_feats, value_feats.permute(1, 0)
        )
        sim_q2t = torch.matmul(
            value_feats, query_feats.permute(1, 0)
        )  # n*n similarity matrices

        targets = torch.linspace(0, query_feats.size(0) - 1, query_feats.size(0), dtype=int).to(
            query_feats.device
        )
        # symmetric in-batch contrastive loss: row i of the queries should match row i of the values
        loss_itc = loss_fct(sim_t2q, targets) + loss_fct(sim_q2t, targets)
        # print(f'loss_itc\n{loss_itc}\n\nsim_t2q\n{sim_t2q}\n\nsim_q2t\n{sim_q2t}')

        return CausalLMOutputWithPast(
            loss=loss_itc,
            logits=None,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# tokenize a batch before feeding it to the model
def tokenize_batch(batch, tokenizer):
    # the first half of the batch is queries, the second half is values
    batch_list = []
    batch_list.extend(batch['query'])
    batch_list.extend(batch['value'])
    batch_tokenize = tokenizer(batch_list, return_tensors='pt', padding='longest', return_token_type_ids=False)

    # manually append an EOS token
    batch_size = batch_tokenize['input_ids'].shape[0]
    eos_tokens = torch.full((batch_size, 1), tokenizer.eos_token_id, dtype=batch_tokenize['input_ids'].dtype)
    # concatenate the EOS token to the end of input_ids
    batch_tokenize['input_ids'] = torch.cat([batch_tokenize['input_ids'], eos_tokens], dim=-1)
    # extend attention_mask with ones so the newly appended EOS token counts as a valid position
    attention_masks_eos = torch.ones((batch_size, 1), dtype=batch_tokenize['attention_mask'].dtype)
    batch_tokenize['attention_mask'] = torch.cat([batch_tokenize['attention_mask'], attention_masks_eos], dim=-1)
    return batch_tokenize


# save model, tokenizer, config and training statistics
def save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch, step=None):
    if step is not None:
        dir_name = os.path.join(args.save_dir, f'epoch_{epoch}', f'step_{step}')
    else:
        dir_name = os.path.join(args.save_dir, f'epoch_{epoch}')
    model_engine.save_16bit_model(dir_name)
    tokenizer.save_pretrained(dir_name)
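    # also write the model config next to the weights so the checkpoint directory can be reloaded with from_pretrained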
    model_engine.model.config.save_pretrained(dir_name)
    # save training statistics
    with open(os.path.join(dir_name, 'training_params_save.json'), 'w') as f:
        json.dump(training_params_save, f, indent=4)


# load the data, then trim it so the size fits exactly
def load_training_dataset(args, file_folder_path):
    print('loading dataset \n')
    # the given path is a directory: load every parquet file found under it
    if os.path.isdir(file_folder_path):
        parquet_files = []
        for filepath, dirnames, filenames in os.walk(file_folder_path):
            for filename in filenames:
                if filename.endswith('parquet'):
                    fullname = os.path.join(filepath, filename)
                    parquet_files.append(fullname)
        ds = load_dataset("parquet", data_files=parquet_files)['train']
    # the given path is a single file: load that parquet file
    elif os.path.isfile(file_folder_path):
        ds = load_dataset("parquet", data_files=file_folder_path)['train']
    # shuffle the dataset
    ds = ds.shuffle(seed=args.seed)
    return ds


# initialize model and tokenizer
def initialize_model_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False

    print('initializing model \n')
    config = AutoConfig.from_pretrained(args.model)
    if args.gradient_checkpointing:
        config.gradient_checkpointing = True
        config.use_cache = False
    model = Qwen2ForCausalLM.from_pretrained(args.model, config=config)

    # if hasattr(model, "enable_input_require_grads"):
    #     model.enable_input_require_grads()
    # else:
    #     def make_inputs_require_grad(module, input, output):
    #         output.requires_grad_(True)
    #     model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    model.train()

    def find_all_linear_names(model):
        cls = torch.nn.Linear
        lora_module_names = set()
        for name, module in model.named_modules():
            if isinstance(module, cls):
                names = name.split('.')
                lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:  # needed for 16-bit
            lora_module_names.remove('lm_head')
        return list(lora_module_names)

    ############################################### LoRA
    if LoRA:
        loraconfig = LoraConfig(
            r=128,
            lora_alpha=256,
            target_modules=find_all_linear_names(model),
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, loraconfig)
        # model = PeftModel.from_pretrained(model, args.save_dir)

    # TODO(enijkamp): we need to set this flag twice?
    # forget all activations during the forward pass and recompute them during the backward pass
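    # NOTE: if LoRA is combined with gradient checkpointing, the commented-out
    # enable_input_require_grads() block above is usually needed so that gradients
    # can flow back through the frozen embedding outputs into the adapter layers.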
    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    return model, tokenizer


def deepspeed_train(args, ds, model, tokenizer):
    set_seed(args.seed)
    set_cuda(deterministic=deterministic)

    print('initializing deepspeed \n')
    model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = AdamW(model_parameters, lr=lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.05)

    def _get_cosine_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    lr_lambda = partial(
        _get_cosine_schedule_with_warmup_lr_lambda,
        num_warmup_steps=100,
        num_training_steps=num_epochs * steps,
        num_cycles=0.5,
    )
    scheduler = LambdaLR(optimizer, lr_lambda)

    model_engine, optimizer, dataloader, lr_scheduler = deepspeed.initialize(
        config=args.deepspeed_config,
        model=model,
        model_parameters=model_parameters,
        training_data=ds,
        optimizer=optimizer,
        lr_scheduler=scheduler,
    )
    torch.cuda.empty_cache()

    print('starting training \n')
    training_params_save = {
        'epoch': [],
        'step': [],
        'loss': [],
        'lr': [],
        'cost_time': [],
        'total_cost_time': [],
        'percent': [],
    }
    total_start_time = time()
    for epoch in range(1, num_epochs + 1):
        for step, batch in enumerate(dataloader, start=1):
            batch = tokenize_batch(batch, tokenizer)
            start_time = time()
            batch.to(model_engine.device)

            loss = model_engine(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).loss
            model_engine.backward(loss)
            model_engine.step()

            if step % steps_per_print == 0:
                # print training progress
                percent = step * args.deepspeed_config['train_micro_batch_size_per_gpu'] * num_gpus / len(ds)
                cost_time = time() - start_time
                total_cost_time = time() - total_start_time
                print(f'epoch {epoch} step {step} {percent:.2%} loss:{loss:8.3f} time:{cost_time:.2f} total time:{total_cost_time:.2f}\n')
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                print(f'GPU State --- Free:{meminfo.free/1024**2} Used:{meminfo.used/1024**2} Total:{meminfo.total/1024**2}\n')  # GPU memory sizes in MiB (float)
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                print(f'GPU Util --- {utilization.gpu}%\n')  # GPU utilization

                # record training statistics
                training_params_save['epoch'].append(epoch)
                training_params_save['step'].append(step)
                training_params_save['loss'].append(float(loss))
                training_params_save['lr'].append(optimizer.param_groups[0]['lr'])
                training_params_save['cost_time'].append(round(cost_time, 4))
                training_params_save['total_cost_time'].append(round(total_cost_time, 4))
                training_params_save['percent'].append(round(percent, 4))

            # save a checkpoint every steps_per_save steps
            if save_flag and step % steps_per_save == 0:
                save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch, step)

        # save a checkpoint at the end of every epoch
        if save_flag:
            save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch)


if __name__ == '__main__':
    args = argparse.Namespace()
    args.seed = 0

    model_name = 'Qwen2p5Coder_1p5B'
    args.model = f'/home/chenzy/models/{model_name}'
    args.training_dataset = '/home/chenzy/sysu_datasets/SolRet_training'
    args.save_dir = f'/home/chenzy/models/save/{model_name}_SolRet'

    ds = load_training_dataset(args, args.training_dataset)

    lr = 1e-4
    train_micro_batch_size_per_gpu = 128
    gradient_accumulation_steps = 1
    num_gpus = 4
    dataset_len = len(ds)
    steps = dataset_len // (num_gpus * train_micro_batch_size_per_gpu)
    # steps=5
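    # trim the dataset to a whole number of global batches so every optimizer step sees a full batch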
    select_dataset_num = steps * num_gpus * train_micro_batch_size_per_gpu
    ds = ds.select(range(select_dataset_num))

    num_epochs = 3
    steps_per_print = 25
    steps_per_save = steps // 4
    args.gradient_checkpointing = True
    save_flag = True
    deterministic = True
    LoRA = False

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # index of the GPU to monitor

    model, tokenizer = initialize_model_tokenizer(args)

    DEEPSPEED_CONFIG = {
        # 'fp16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 12, 'hysteresis': 2, 'min_loss_scale': 1},
        # 'bf16': {'enabled': True},
        'fp16': {'enabled': True},

        # 'optimizer': {'type': 'AdamW', 'params': {'lr': 1e-05, 'betas': [0.9, 0.95], 'eps': 1e-08, 'weight_decay': 0.0}},
        # 'scheduler': {'type': 'WarmupLR', 'params': {'warmup_min_lr': 0, 'warmup_max_lr': 1e-05, 'warmup_num_steps': 100}},

        # stage 1: optimizer states
        # stage 2: gradients
        # stage 3: model parameters
        # 'zero_optimization': {
        #     'stage': 2,
        #     # pin_memory improves throughput
        #     # device: 'cpu' or none
        #     # 'offload_optimizer': {'device': 'cpu', 'pin_memory': True},
        #     # 'offload_param': {'device': 'cpu', 'pin_memory': True},
        #     # trade off GPU RAM and latency
        #     'overlap_comm': True,
        #     # reduce memory fragmentation
        #     'contiguous_gradients': True,
        #     # default = 1e9, when not using NVMe
        #     # 'sub_group_size': 1e9,
        #     # 'reduce_bucket_size': 16777216,
        #     # 'stage3_prefetch_bucket_size': 15099494.4,
        #     # 'stage3_param_persistence_threshold': 40960,
        #     # 'gather_16bit_weights_on_model_save': True,
        # },
        'zero_optimization': {'stage': 0},

        # This is the number of data samples that leads to one step of model update.
        # train_batch_size must equal train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of GPUs.
        # 'train_batch_size': train_batch_size,
        # Batch size to be processed by one GPU in one step (without gradient accumulation).
        'train_micro_batch_size_per_gpu': train_micro_batch_size_per_gpu,
        # Number of training steps to accumulate gradients before averaging and applying them.
        'gradient_accumulation_steps': gradient_accumulation_steps,
        'gradient_clipping': 1.0,

        # Print a progress report every N training steps. The report includes the number of training
        # steps and the number of skipped optimizer updates.
        'steps_per_print': steps_per_print * 5,
        'wall_clock_breakdown': False,
        'compression_training': {
            'weight_quantization': {'shared_parameters': {}, 'different_groups': {}},
            'activation_quantization': {'shared_parameters': {}, 'different_groups': {}},
            'sparse_pruning': {'shared_parameters': {}, 'different_groups': {}},
            'row_pruning': {'shared_parameters': {}, 'different_groups': {}},
            'head_pruning': {'shared_parameters': {}, 'different_groups': {}},
            'channel_pruning': {'shared_parameters': {}, 'different_groups': {}},
        },
    }
    args.deepspeed_config = DEEPSPEED_CONFIG

    deepspeed_train(args=args, ds=ds, model=model, tokenizer=tokenizer)
```
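
After training, the `retrieve_proj` head can embed queries and candidate values for retrieval. The snippet below is a minimal sketch of that usage, not part of the training script: it assumes it runs in the same module as the reference code (so the `Qwen2ForCausalLM` subclass with `retrieve_proj` is in scope), that a checkpoint directory was produced by `save_model_tokenizer_config_params`, and it mirrors the left-padding plus appended-EOS convention from `tokenize_batch`. The `encode` helper and the checkpoint path are illustrative assumptions.

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

# hypothetical checkpoint directory written by save_model_tokenizer_config_params
CKPT = '/home/chenzy/models/save/Qwen2p5Coder_1p5B_SolRet/epoch_3'

tokenizer = AutoTokenizer.from_pretrained(CKPT)
tokenizer.padding_side = 'left'
# Qwen2ForCausalLM is the subclass defined in the reference code (it owns retrieve_proj)
model = Qwen2ForCausalLM.from_pretrained(CKPT).eval().cuda()


@torch.no_grad()
def encode(texts):
    """Embed a list of strings with the retrieval head (illustrative helper)."""
    batch = tokenizer(texts, return_tensors='pt', padding='longest', return_token_type_ids=False)
    # append an EOS token, exactly as tokenize_batch does during training
    eos = torch.full((batch['input_ids'].shape[0], 1), tokenizer.eos_token_id,
                     dtype=batch['input_ids'].dtype)
    input_ids = torch.cat([batch['input_ids'], eos], dim=-1).cuda()
    attention_mask = torch.cat([batch['attention_mask'], torch.ones_like(eos)], dim=-1).cuda()
    # call the backbone directly (model.model) to skip the contrastive forward,
    # then project the last-token hidden state and L2-normalize it
    hidden = model.model(input_ids=input_ids, attention_mask=attention_mask)[0]
    return F.normalize(model.retrieve_proj(hidden[:, -1, :]), dim=-1)


query_emb = encode(['how do I reverse a linked list?'])
value_embs = encode(['def reverse_list(head): ...', 'def quicksort(arr): ...'])
scores = query_emb @ value_embs.T   # cosine similarities, higher is better
best = scores.argmax(dim=-1)        # index of the best-matching value
```

Because the embeddings are L2-normalized, the dot product equals cosine similarity, so ranking candidate values for a query is a single matrix multiplication.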