Approach
- Run contrastive learning between the query and value fields of the SolRet_training training set (a short sketch of the objective and the expected data format follows).
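A minimal, self-contained sketch of the in-batch contrastive (InfoNCE-style) objective that the reference code below implements in its custom `forward`. It assumes, as the code does, that every SolRet_training row pairs a `query` string with its matching `value` string, so positives share the same row index within a batch; the batch size and the 256-dim embedding width here are illustrative only.

```python
import torch
import torch.nn.functional as F

def symmetric_contrastive_loss(query_feats: torch.Tensor, value_feats: torch.Tensor) -> torch.Tensor:
    """In-batch contrastive loss: query i should match value i (diagonal positives)."""
    query_feats = F.normalize(query_feats, dim=-1)
    value_feats = F.normalize(value_feats, dim=-1)
    sim = query_feats @ value_feats.T                       # (B, B) cosine similarities
    targets = torch.arange(sim.size(0), device=sim.device)  # positives sit on the diagonal
    # Symmetric cross-entropy over the query->value and value->query directions.
    return F.cross_entropy(sim, targets) + F.cross_entropy(sim.T, targets)

if __name__ == "__main__":
    q = torch.randn(8, 256)  # 8 query embeddings, same width as retrieve_proj below
    v = torch.randn(8, 256)  # 8 value embeddings
    print(symmetric_contrastive_loss(q, v))
```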
Reference code
import os
import json
import argparse
import random
import math
from time import time
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM as Qwen2ForCausalLMOrig
from transformers.models.qwen2.modeling_qwen2 import QWEN2_INPUTS_DOCSTRING, _CONFIG_FOR_DOC
from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
from typing import List, Optional, Tuple, Union
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from transformers.modeling_outputs import CausalLMOutputWithPast
from datasets import load_dataset
import deepspeed
from peft import LoraConfig, get_peft_model, PeftModel
from functools import partial
import pynvml
class print_time:
    def __init__(self, *desc):
        self.desc = desc

    def __enter__(self):
        print(*self.desc)
        self.t = time()

    def __exit__(self, type, value, traceback):
        print(f'{time()-self.t:.02f}s')
def set_gpus(gpu):
    torch.cuda.set_device(gpu)

def set_seed(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

def set_cuda(deterministic=True):
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = deterministic
        torch.backends.cudnn.benchmark = not deterministic
class Qwen2ForCausalLM(Qwen2ForCausalLMOrig):
    def __init__(self, config):
        super().__init__(config)
        retrieve_size = 256  # dimensionality of the retrieval embedding
        self.retrieve_proj = nn.Linear(config.hidden_size, retrieve_size, bias=False)

    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen2ForCausalLM

        >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        loss_fct = CrossEntropyLoss()
        # The backbone runs under no_grad, so only self.retrieve_proj receives gradients.
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                cache_position=cache_position,
            )
        hidden_states = outputs[0]
        # With left padding and an appended EOS, position -1 is the EOS of every sequence.
        input_feats = F.normalize(
            self.retrieve_proj(hidden_states[:, -1, :]), dim=-1
        )
        # tokenize_batch packs the batch as [queries; values], so split it in half.
        half_size = input_feats.shape[0] // 2
        query_feats = input_feats[:half_size, :]
        value_feats = input_feats[half_size:, :]
        sim_t2q = torch.matmul(
            query_feats, value_feats.permute(1, 0)
        )
        sim_q2t = torch.matmul(
            value_feats, query_feats.permute(1, 0)
        )
        # Positives sit on the diagonal: query i matches value i.
        targets = torch.arange(query_feats.size(0), dtype=torch.long, device=query_feats.device)
        loss_itc = loss_fct(sim_t2q, targets) + loss_fct(sim_q2t, targets)
        return CausalLMOutputWithPast(
            loss=loss_itc,
            logits=None,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
def tokenize_batch(batch, tokenizer):
    # Pack queries first, then values; the custom forward splits the batch back in half.
    batch_list = []
    batch_list.extend(batch['query'])
    batch_list.extend(batch['value'])
    batch_tokenize = tokenizer(batch_list, return_tensors='pt', padding='longest', return_token_type_ids=False)
    # Append an EOS token to every sequence; with left padding it lands at position -1.
    batch_size = batch_tokenize['input_ids'].shape[0]
    eos_tokens = torch.full((batch_size, 1), tokenizer.eos_token_id, dtype=batch_tokenize['input_ids'].dtype)
    batch_tokenize['input_ids'] = torch.cat([batch_tokenize['input_ids'], eos_tokens], dim=-1)
    attention_masks_eos = torch.ones((batch_size, 1), dtype=batch_tokenize['attention_mask'].dtype)
    batch_tokenize['attention_mask'] = torch.cat([batch_tokenize['attention_mask'], attention_masks_eos], dim=-1)
    return batch_tokenize
def save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch, step=None):
    if step is not None:
        dir_name = os.path.join(args.save_dir, f'epoch_{epoch}', f'step_{step}')
    else:
        dir_name = os.path.join(args.save_dir, f'epoch_{epoch}')
    model_engine.save_16bit_model(dir_name)
    tokenizer.save_pretrained(dir_name)
    model_engine.model.config.save_pretrained(dir_name)
    with open(os.path.join(dir_name, 'training_params_save.json'), 'w') as f:
        json.dump(training_params_save, f, indent=4)
def load_training_dataset(args, file_folder_path):
    print('loading dataset \n')
    if os.path.isdir(file_folder_path):
        parquet_files = []
        for filepath, dirnames, filenames in os.walk(file_folder_path):
            for filename in filenames:
                if filename.endswith('parquet'):
                    fullname = os.path.join(filepath, filename)
                    parquet_files.append(fullname)
        ds = load_dataset("parquet", data_files=parquet_files)['train']
    elif os.path.isfile(file_folder_path):
        ds = load_dataset("parquet", data_files=file_folder_path)['train']
    ds = ds.shuffle(seed=args.seed)
    return ds
def initialize_model_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Left padding keeps the appended EOS at position -1 for every sequence.
    tokenizer.padding_side = 'left'
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False
    print('initializing model \n')
    config = AutoConfig.from_pretrained(args.model)
    if args.gradient_checkpointing:
        config.gradient_checkpointing = True
        config.use_cache = False
    model = Qwen2ForCausalLM.from_pretrained(args.model, config=config)
    model.train()

    def find_all_linear_names(model):
        cls = torch.nn.Linear
        lora_module_names = set()
        for name, module in model.named_modules():
            if isinstance(module, cls):
                names = name.split('.')
                lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:
            lora_module_names.remove('lm_head')
        return list(lora_module_names)

    if LoRA:  # LoRA is a module-level flag set in the __main__ block below
        loraconfig = LoraConfig(
            r=128,
            lora_alpha=256,
            target_modules=find_all_linear_names(model),
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, loraconfig)
    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()
    return model, tokenizer
def deepspeed_train(args, ds, model, tokenizer):
    set_seed(args.seed)
    set_cuda(deterministic=deterministic)
    print('initializing deepspeed \n')
    model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = AdamW(model_parameters, lr=lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.05)

    def _get_cosine_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    lr_lambda = partial(
        _get_cosine_schedule_with_warmup_lr_lambda,
        num_warmup_steps=100,
        num_training_steps=num_epochs * steps,
        num_cycles=0.5,
    )
    scheduler = LambdaLR(optimizer, lr_lambda)
    model_engine, optimizer, dataloader, lr_scheduler = deepspeed.initialize(
        config=args.deepspeed_config,
        model=model,
        model_parameters=model_parameters,
        training_data=ds,
        optimizer=optimizer,
        lr_scheduler=scheduler,
    )
    torch.cuda.empty_cache()
    print('starting training \n')
    training_params_save = {
        'epoch': [],
        'step': [],
        'loss': [],
        'lr': [],
        'cost_time': [],
        'total_cost_time': [],
        'percent': [],
    }
    total_start_time = time()
    for epoch in range(1, num_epochs + 1):
        for step, batch in enumerate(dataloader, start=1):
            # Each batch carries query/value strings; the custom forward returns the contrastive loss.
            batch = tokenize_batch(batch, tokenizer)
            start_time = time()
            batch.to(model_engine.device)
            loss = model_engine(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).loss
            model_engine.backward(loss)
            model_engine.step()
            if step % steps_per_print == 0:
                percent = step * args.deepspeed_config['train_micro_batch_size_per_gpu'] * num_gpus / len(ds)
                cost_time = time() - start_time
                total_cost_time = time() - total_start_time
                print(f'epoch {epoch} step {step} {percent:.2%} loss:{loss:8.3f} time:{cost_time:.2f} total time:{total_cost_time:.2f}\n')
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                print(f'GPU State --- Free:{meminfo.free/1024**2} Used:{meminfo.used/1024**2} Total:{meminfo.total/1024**2}\n')
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                print(f'GPU Util --- {utilization.gpu}%\n')
                training_params_save['epoch'].append(epoch)
                training_params_save['step'].append(step)
                training_params_save['loss'].append(float(loss))
                training_params_save['lr'].append(optimizer.param_groups[0]['lr'])
                training_params_save['cost_time'].append(round(cost_time, 4))
                training_params_save['total_cost_time'].append(round(total_cost_time, 4))
                training_params_save['percent'].append(round(percent, 4))
            if save_flag and step % steps_per_save == 0:
                save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch, step)
        if save_flag:
            save_model_tokenizer_config_params(args, model_engine, tokenizer, training_params_save, epoch)
if __name__ == '__main__':
    args = argparse.Namespace()
    args.seed = 0
    model_name = 'Qwen2p5Coder_1p5B'
    args.model = f'/home/chenzy/models/{model_name}'
    args.training_dataset = '/home/chenzy/sysu_datasets/SolRet_training'
    args.save_dir = f'/home/chenzy/models/save/{model_name}_SolRet'
    ds = load_training_dataset(args, args.training_dataset)
    lr = 1e-4
    train_micro_batch_size_per_gpu = 128
    gradient_accumulation_steps = 1
    num_gpus = 4
    dataset_len = len(ds)
    # Drop the tail of the dataset so every step sees a full micro-batch on every GPU.
    steps = dataset_len // (num_gpus * train_micro_batch_size_per_gpu)
    select_dataset_num = steps * num_gpus * train_micro_batch_size_per_gpu
    ds = ds.select(range(select_dataset_num))
    num_epochs = 3
    steps_per_print = 25
    steps_per_save = steps // 4
    args.gradient_checkpointing = True
    save_flag = True
    deterministic = True
    LoRA = False
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    model, tokenizer = initialize_model_tokenizer(args)
    DEEPSPEED_CONFIG = {
        'fp16': {'enabled': True},
        'zero_optimization': {'stage': 0},
        'train_micro_batch_size_per_gpu': train_micro_batch_size_per_gpu,
        'gradient_accumulation_steps': gradient_accumulation_steps,
        'gradient_clipping': 1.0,
        'steps_per_print': steps_per_print * 5,
        'wall_clock_breakdown': False,
        'compression_training': {
            'weight_quantization': {'shared_parameters': {}, 'different_groups': {}},
            'activation_quantization': {'shared_parameters': {}, 'different_groups': {}},
            'sparse_pruning': {'shared_parameters': {}, 'different_groups': {}},
            'row_pruning': {'shared_parameters': {}, 'different_groups': {}},
            'head_pruning': {'shared_parameters': {}, 'different_groups': {}},
            'channel_pruning': {'shared_parameters': {}, 'different_groups': {}},
        },
    }
    args.deepspeed_config = DEEPSPEED_CONFIG
    deepspeed_train(args=args, ds=ds, model=model, tokenizer=tokenizer)
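After training, the saved checkpoint can be used to embed queries and candidate values for retrieval. The sketch below is illustrative and not part of the reference script: it mirrors the training-time pipeline (left padding, appended EOS, last-position hidden state passed through retrieve_proj and L2-normalized), and the checkpoint path and helper name are placeholders.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def embed_texts(model, tokenizer, texts):
    # Same preprocessing as tokenize_batch: left padding plus an appended EOS token.
    batch = tokenizer(texts, return_tensors='pt', padding='longest', return_token_type_ids=False)
    eos = torch.full((batch['input_ids'].size(0), 1), tokenizer.eos_token_id, dtype=batch['input_ids'].dtype)
    batch['input_ids'] = torch.cat([batch['input_ids'], eos], dim=-1)
    batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(eos)], dim=-1)
    batch = {k: v.to(model.device) for k, v in batch.items()}
    hidden = model.model(**batch)[0]  # backbone hidden states; bypasses the loss-only forward
    return F.normalize(model.retrieve_proj(hidden[:, -1, :]), dim=-1)

# Illustrative usage (paths are placeholders):
# model = Qwen2ForCausalLM.from_pretrained('<save_dir>/epoch_3').eval()
# tokenizer = AutoTokenizer.from_pretrained('<save_dir>/epoch_3')
# scores = embed_texts(model, tokenizer, queries) @ embed_texts(model, tokenizer, values).T
```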