# Method
- Train with contrastive learning between the `query` and `value` fields of the SolRet_training training set (a minimal sketch of the loss follows).
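Concretely, the query and the value at the same batch index form the positive pair, and every other value in the batch acts as a negative; the objective is a symmetric cross-entropy over the two similarity matrices, the same in-batch contrastive loss computed in the reference script's `forward`. A minimal sketch on toy tensors (the function and variable names here are illustrative, not from the script):

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(query_feats, value_feats):
    # query_feats, value_feats: (batch, dim) L2-normalized embeddings
    sim_q2v = query_feats @ value_feats.T   # (batch, batch): row i scores query i against every value
    sim_v2q = value_feats @ query_feats.T
    targets = torch.arange(query_feats.size(0), device=query_feats.device)  # positives sit on the diagonal
    # symmetric cross-entropy: query->value and value->query directions
    return F.cross_entropy(sim_q2v, targets) + F.cross_entropy(sim_v2q, targets)

# toy usage with random embeddings
q = F.normalize(torch.randn(4, 256), dim=-1)
v = F.normalize(torch.randn(4, 256), dim=-1)
print(in_batch_contrastive_loss(q, v))
```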
# Reference Code
```python
########################################################################### imports
import os
import json
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import argparse
import random
import math
from time import time
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as Qwen2ForCausalLMOrig
from transformers.models.qwen2.modeling_qwen2 import QWEN2_INPUTS_DOCSTRING,_CONFIG_FOR_DOC
from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
from typing import List, Optional, Tuple, Union
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from transformers.modeling_outputs import CausalLMOutputWithPast
from datasets import load_dataset
import deepspeed
from peft import LoraConfig, get_peft_model, PeftModel
from functools import partial
import pynvml
class print_time:
    def __init__(self, *desc):
        self.desc = desc
    def __enter__(self):
        print(*self.desc)
        self.t = time()
    def __exit__(self, type, value, traceback):
        print(f'{time()-self.t:.02f}s')
# pin the process to a specific GPU
def set_gpus(gpu):
    torch.cuda.set_device(gpu)
def set_seed(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
def set_cuda(deterministic=True):
    if torch.cuda.is_available():
        # make the cuDNN convolution algorithms deterministic
        torch.backends.cudnn.deterministic = deterministic
        # Setting torch.backends.cudnn.benchmark=True spends a little extra time at startup searching for the
        # fastest convolution implementation for each layer, which speeds up training when the network structure
        # and the input shapes (batch size, image size, channels) are fixed -- the common case. If the convolution
        # shapes keep changing, the repeated searches cost more time than they save.
        torch.backends.cudnn.benchmark = not deterministic
class Qwen2ForCausalLM(Qwen2ForCausalLMOrig):
    def __init__(self, config):
        super().__init__(config)
        # projection head that maps a hidden state to a 256-dim retrieval embedding
        retrieve_size = 256
        self.retrieve_proj = nn.Linear(config.hidden_size, retrieve_size, bias=False)
    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen2ForCausalLM

        >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        #################################################################################################################
        loss_fct = CrossEntropyLoss()
        # compute query_feats / value_feats from the backbone under no_grad:
        # the backbone stays frozen here and only retrieve_proj receives gradients
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                cache_position=cache_position,
            )
        hidden_states = outputs[0]
        # project the last-token hidden state and L2-normalize it to get the retrieval embedding
        input_feats = F.normalize(
            self.retrieve_proj(hidden_states[:, -1, :]), dim=-1
        )
        # half of the batch dimension
        half_size = input_feats.shape[0] // 2
        # first half of the batch: query embeddings
        query_feats = input_feats[:half_size, :]
        # second half of the batch: value embeddings
        value_feats = input_feats[half_size:, :]
        sim_t2q = torch.matmul(
            query_feats, value_feats.permute(1, 0)
        )
        sim_q2t = torch.matmul(
            value_feats, query_feats.permute(1, 0)
        )
        # n*n similarity matrices; the diagonal holds the positive pairs
        targets = torch.linspace(0, query_feats.size(0) - 1, query_feats.size(0), dtype=int).to(
            query_feats.device
        )
        # symmetric in-batch contrastive loss
        loss_itc = loss_fct(sim_t2q, targets) + loss_fct(sim_q2t, targets)
        # print(f'loss_itc\n{loss_itc}\n\nsim_t2q\n{sim_t2q}\n\nsim_q2t\n{sim_q2t}')
        return CausalLMOutputWithPast(
            loss=loss_itc,
            logits=None,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# tokenize a batch before feeding it to the model
def tokenize_batch(batch, tokenizer):
    # the first half of the list holds queries, the second half holds values
    batch_list = []
    batch_list.extend(batch['query'])
    batch_list.extend(batch['value'])
    batch_tokenize = tokenizer(batch_list, return_tensors='pt', padding='longest', return_token_type_ids=False)
    # manually append the eos token
    batch_size = batch_tokenize['input_ids'].shape[0]
    eos_tokens = torch.full((batch_size, 1), tokenizer.eos_token_id, dtype=batch_tokenize['input_ids'].dtype)
    # concatenate the EOS token to the end of input_ids
    batch_tokenize['input_ids'] = torch.cat([batch_tokenize['input_ids'], eos_tokens], dim=-1)
    # also extend attention_mask with 1s so the newly added EOS token counts as valid
    attention_masks_eos = torch.ones((batch_size, 1), dtype=batch_tokenize['attention_mask'].dtype)
    batch_tokenize['attention_mask'] = torch.cat([batch_tokenize['attention_mask'], attention_masks_eos], dim=-1)
    return batch_tokenize
# save the model, tokenizer, config and training statistics
def save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch, step=None):
    if step is not None:
        dir_name = os.path.join(args.save_dir, f'epoch_{epoch}', f'step_{step}')
    else:
        dir_name = os.path.join(args.save_dir, f'epoch_{epoch}')
    model_engine.save_16bit_model(dir_name)
    tokenizer.save_pretrained(dir_name)
    model_engine.model.config.save_pretrained(dir_name)
    # save the training statistics
    with open(os.path.join(dir_name, 'training_params_save.json'), 'w') as f:
        json.dump(training_params_save, f, indent=4)
# load the training data; the dataset is later trimmed to a size that fits the batch layout exactly
def load_training_dataset(args, file_folder_path):
    print('loading dataset \n')
    # if the given path is a directory, gather every parquet file under it
    if os.path.isdir(file_folder_path):
        parquet_files = []
        for filepath, dirnames, filenames in os.walk(file_folder_path):
            for filename in filenames:
                if filename.endswith('parquet'):
                    fullname = os.path.join(filepath, filename)
                    parquet_files.append(fullname)
        ds = load_dataset("parquet", data_files=parquet_files)['train']
    # if the given path is a single file, load that parquet file directly
    elif os.path.isfile(file_folder_path):
        ds = load_dataset("parquet", data_files=file_folder_path)['train']
    # shuffle the dataset
    ds = ds.shuffle(seed=args.seed)
    return ds
# initialize the model and tokenizer
def initialize_model_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False
    print('initializing model \n')
    config = AutoConfig.from_pretrained(args.model)
    if args.gradient_checkpointing:
        config.gradient_checkpointing = True
        config.use_cache = False
    model = Qwen2ForCausalLM.from_pretrained(args.model, config=config)
    # if hasattr(model, "enable_input_require_grads"):
    #     model.enable_input_require_grads()
    # else:
    #     def make_inputs_require_grad(module, input, output):
    #         output.requires_grad_(True)
    #     model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
    model.train()
    def find_all_linear_names(model):
        cls = torch.nn.Linear
        lora_module_names = set()
        for name, module in model.named_modules():
            if isinstance(module, cls):
                names = name.split('.')
                lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:  # needed for 16-bit
            lora_module_names.remove('lm_head')
        return list(lora_module_names)
    ############################################### LoRA
    if LoRA:
        loraconfig = LoraConfig(
            r=128,
            lora_alpha=256,
            target_modules=find_all_linear_names(model),
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, loraconfig)
        # model = PeftModel.from_pretrained(model, args.save_dir)
    # TODO(enijkamp): we need to set this flag twice?
    # forget all activations during forward and recompute during the backward
    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()
    return model, tokenizer
def deepspeed_train(args, ds, model, tokenizer):
    set_seed(args.seed)
    set_cuda(deterministic=deterministic)
    print('initializing deepspeed \n')
    model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = AdamW(model_parameters, lr=lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.05)
    def _get_cosine_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
    lr_lambda = partial(
        _get_cosine_schedule_with_warmup_lr_lambda,
        num_warmup_steps=100,
        num_training_steps=num_epochs * steps,
        num_cycles=0.5,
    )
    scheduler = LambdaLR(optimizer, lr_lambda)
    model_engine, optimizer, dataloader, lr_scheduler = deepspeed.initialize(
        config=args.deepspeed_config,
        model=model,
        model_parameters=model_parameters,
        training_data=ds,
        optimizer=optimizer,
        lr_scheduler=scheduler,
    )
    torch.cuda.empty_cache()
    print('starting training \n')
    training_params_save = {
        'epoch': [],
        'step': [],
        'loss': [],
        'lr': [],
        'cost_time': [],
        'total_cost_time': [],
        'percent': [],
    }
    total_start_time = time()
    for epoch in range(1, num_epochs + 1):
        for step, batch in enumerate(dataloader, start=1):
            batch = tokenize_batch(batch, tokenizer)
            start_time = time()
            batch.to(model_engine.device)
            loss = model_engine(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).loss
            model_engine.backward(loss)
            model_engine.step()
            if step % steps_per_print == 0:
                # print training progress
                percent = step * args.deepspeed_config['train_micro_batch_size_per_gpu'] * num_gpus / len(ds)
                cost_time = time() - start_time
                total_cost_time = time() - total_start_time
                print(f'epoch {epoch} step {step} {percent:.2%} loss:{loss:8.3f} time:{cost_time:.2f} total time:{total_cost_time:.2f}\n')
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                print(f'GPU State --- Free:{meminfo.free/1024**2} Used:{meminfo.used/1024**2} Total:{meminfo.total/1024**2}\n')  # GPU memory, reported in MiB
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                print(f'GPU Util --- {utilization.gpu}%\n')  # GPU utilization
                # record training statistics
                training_params_save['epoch'].append(epoch)
                training_params_save['step'].append(step)
                training_params_save['loss'].append(float(loss))
                training_params_save['lr'].append(optimizer.param_groups[0]['lr'])
                training_params_save['cost_time'].append(round(cost_time, 4))
                training_params_save['total_cost_time'].append(round(total_cost_time, 4))
                training_params_save['percent'].append(round(percent, 4))
            # save a checkpoint every steps_per_save steps
            if save_flag and step % steps_per_save == 0:
                save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch, step)
        # save a checkpoint once per epoch
        if save_flag:
            save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch)
if __name__ == '__main__':
    args = argparse.Namespace()
    args.seed = 0
    model_name = 'Qwen2p5Coder_1p5B'
    args.model = f'/home/chenzy/models/{model_name}'
    args.training_dataset = '/home/chenzy/sysu_datasets/SolRet_training'
    args.save_dir = f'/home/chenzy/models/save/{model_name}_SolRet'
    ds = load_training_dataset(args, args.training_dataset)
    lr = 1e-4
    train_micro_batch_size_per_gpu = 128
    gradient_accumulation_steps = 1
    num_gpus = 4
    dataset_len = len(ds)
    steps = dataset_len // (num_gpus * train_micro_batch_size_per_gpu)
    # steps = 5
    select_dataset_num = steps * num_gpus * train_micro_batch_size_per_gpu
    ds = ds.select(range(select_dataset_num))
    num_epochs = 3
    steps_per_print = 25
    steps_per_save = steps // 4
    args.gradient_checkpointing = True
    save_flag = True
    deterministic = True
    LoRA = False
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # index of the GPU to monitor
    model, tokenizer = initialize_model_tokenizer(args)
    DEEPSPEED_CONFIG = \
        {
            # 'fp16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 12, 'hysteresis': 2, 'min_loss_scale': 1},
            # 'bf16': {'enabled': True},
            'fp16': {'enabled': True},
            # 'optimizer': {'type': 'AdamW', 'params': {'lr': 1e-05, 'betas': [0.9, 0.95], 'eps': 1e-08, 'weight_decay': 0.0}},
            # 'scheduler': {'type': 'WarmupLR', 'params': {'warmup_min_lr': 0, 'warmup_max_lr': 1e-05, 'warmup_num_steps': 100}},
            # stage 1: optimizer states
            # stage 2: gradients
            # stage 3: model parameters
            # 'zero_optimization': {
            #     'stage': 2,
            #     # pin_memory improves the throughput
            #     # device: or none
            #     # 'offload_optimizer': {'device': 'cpu', 'pin_memory': True},
            #     # 'offload_param': {'device': 'cpu', 'pin_memory': True},
            #     # trade off GPU RAM and latency
            #     'overlap_comm': True,
            #     # reduce memory fragmentation
            #     'contiguous_gradients': True,
            #     # default = 1e9, when not using NVMe
            #     # 'sub_group_size': 1e9,
            #     # 'reduce_bucket_size': 16777216,
            #     # 'stage3_prefetch_bucket_size': 15099494.4,
            #     # 'stage3_param_persistence_threshold': 40960,
            #     # 'gather_16bit_weights_on_model_save': True,
            # },
            'zero_optimization': {'stage': 0},
            # This is the number of data samples that leads to one step of model update. train_batch_size must equal train_micro_batch_size_per_gpu * gradient_accumulation_steps * number of GPUs.
            # 'train_batch_size': train_batch_size,
            # Batch size to be processed by one GPU in one step (without gradient accumulation).
            'train_micro_batch_size_per_gpu': train_micro_batch_size_per_gpu,
            # Number of training steps to accumulate gradients before averaging and applying them.
            'gradient_accumulation_steps': gradient_accumulation_steps,
            'gradient_clipping': 1.0,
            # Print a progress report every N training steps. The report includes the number of training steps and the number of skipped optimizer updates.
            'steps_per_print': steps_per_print * 5,
            'wall_clock_breakdown': False,
            'compression_training': {'weight_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {}, 'different_groups': {}}},
        }
    args.deepspeed_config = DEEPSPEED_CONFIG
    deepspeed_train(args=args, ds=ds, model=model, tokenizer=tokenizer)
```
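The script above is typically launched with the DeepSpeed launcher on all four GPUs, e.g. `deepspeed --num_gpus 4 train.py` (the file name is a placeholder). To use a saved checkpoint as a retriever, embeddings can be produced the same way the training `forward` does: left-pad, append EOS, take the last-position hidden state, project with `retrieve_proj`, and L2-normalize. Below is a minimal, hedged inference sketch; it assumes the `Qwen2ForCausalLM` subclass defined above is importable, and the checkpoint path and query/candidate strings are placeholders, not from the script:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
# CKPT is a placeholder for a directory written by save_model_tokenizer_config_params
CKPT = '/path/to/save_dir/epoch_3'

tokenizer = AutoTokenizer.from_pretrained(CKPT)
tokenizer.padding_side = 'left'
model = Qwen2ForCausalLM.from_pretrained(CKPT).eval()

@torch.no_grad()
def embed(texts):
    # left-pad, append EOS, embed the final position, project and normalize (mirrors the training forward)
    enc = tokenizer(texts, return_tensors='pt', padding='longest', return_token_type_ids=False)
    eos = torch.full((enc['input_ids'].shape[0], 1), tokenizer.eos_token_id, dtype=enc['input_ids'].dtype)
    enc['input_ids'] = torch.cat([enc['input_ids'], eos], dim=-1)
    enc['attention_mask'] = torch.cat([enc['attention_mask'], torch.ones_like(eos)], dim=-1)
    hidden = model.model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'])[0]
    return F.normalize(model.retrieve_proj(hidden[:, -1, :]), dim=-1)

# rank candidate values for a query by cosine similarity (placeholder strings)
query_emb = embed(['how to sort a list in python'])
value_embs = embed(['def bubble_sort(xs): ...', 'def read_file(path): ...'])
print((query_emb @ value_embs.T).argsort(descending=True))
```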