initial clean commit

1faccd4 about 2 months ago

6.75 kB

	# Copyright 2024 Bytedance Ltd. and/or its affiliates
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from dataclasses import dataclass
	from typing import Optional, Union

	import torch
	from transformers.cache_utils import Cache
	from transformers.modeling_outputs import CausalLMOutputWithPast


	@dataclass
	class CausalLMOutputForPPO(CausalLMOutputWithPast):
	    """Causal-LM output container extended with two extra optional fields.

	    All fields declared by ``CausalLMOutputWithPast`` are inherited unchanged;
	    this subclass only appends ``log_probs`` and ``entropy``, both defaulting
	    to ``None``.
	    """

	    # First value returned by the fused linear backends used in this file.
	    log_probs: Optional[torch.FloatTensor] = None
	    # Second value returned by the same backends, alongside ``log_probs``.
	    entropy: Optional[torch.FloatTensor] = None


	def forward_base_model(
	    self,
	    input_ids: Optional[torch.LongTensor] = None,
	    attention_mask: Optional[torch.Tensor] = None,
	    position_ids: Optional[torch.LongTensor] = None,
	    past_key_values: Optional[Cache] = None,
	    inputs_embeds: Optional[torch.FloatTensor] = None,
	    use_cache: Optional[bool] = None,
	    output_attentions: Optional[bool] = None,
	    output_hidden_states: Optional[bool] = None,
	    return_dict: Optional[bool] = None,
	    cache_position: Optional[torch.LongTensor] = None,
	) -> CausalLMOutputWithPast:
	    r"""Run only the wrapped base model (``self.model``) and return its output.

	    Adapted from LLaMA's forward in Liger-Kernel:
	    https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/model/llama.py

	    Intended to be generic enough for all pure text models.

	    ``output_attentions`` and ``output_hidden_states`` fall back to the values
	    on ``self.config`` when passed as ``None``. Every other argument, including
	    ``return_dict``, is forwarded to ``self.model`` exactly as received.

	    Returns:
	        Whatever ``self.model(...)`` returns; nothing is post-processed here.
	    """
	    # Explicit arguments win; ``None`` defers to the model configuration.
	    if output_attentions is None:
	        output_attentions = self.config.output_attentions
	    if output_hidden_states is None:
	        output_hidden_states = self.config.output_hidden_states

	    # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
	    return self.model(
	        input_ids=input_ids,
	        attention_mask=attention_mask,
	        position_ids=position_ids,
	        past_key_values=past_key_values,
	        inputs_embeds=inputs_embeds,
	        use_cache=use_cache,
	        output_attentions=output_attentions,
	        output_hidden_states=output_hidden_states,
	        return_dict=return_dict,
	        cache_position=cache_position,
	    )


	def forward_with_torch_backend(
	    self,
	    input_ids: torch.LongTensor = None,
	    attention_mask: Optional[torch.Tensor] = None,
	    position_ids: Optional[torch.LongTensor] = None,
	    past_key_values: Optional[Union["Cache", list[torch.FloatTensor]]] = None,
	    inputs_embeds: Optional[torch.FloatTensor] = None,
	    labels: Optional[torch.LongTensor] = None,
	    use_cache: Optional[bool] = None,
	    output_attentions: Optional[bool] = None,
	    output_hidden_states: Optional[bool] = None,
	    return_dict: Optional[bool] = None,
	    cache_position: Optional[torch.LongTensor] = None,
	    logits_to_keep: int | torch.Tensor = 0,
	    temperature: float = 1.0,
	    **loss_kwargs,
	) -> tuple | CausalLMOutputForPPO:
	    """Run the base model, then compute log-probs and entropy via ``FusedLinearForPPO``.

	    The last hidden states (``outputs[0]``) and ``self.lm_head.weight`` are
	    handed to ``FusedLinearForPPO`` together with the targets and
	    ``temperature``.

	    Args:
	        input_ids, attention_mask, position_ids, past_key_values,
	        inputs_embeds, use_cache, output_attentions, output_hidden_states,
	        return_dict, cache_position: Forwarded to ``forward_base_model``.
	        labels: Target token ids. When ``None``, ``input_ids`` is used instead.
	        return_dict: Must be truthy; tuple outputs are not implemented.
	        logits_to_keep: Accepted for signature compatibility; not used here.
	        temperature: Passed through to ``FusedLinearForPPO.forward``.
	        **loss_kwargs: Accepted for signature compatibility; not used here.

	    Returns:
	        A ``CausalLMOutputForPPO`` with ``log_probs`` and ``entropy`` set, plus
	        ``past_key_values``, ``hidden_states`` and ``attentions`` copied from
	        the base model output. ``loss`` and ``logits`` are not set.

	    Raises:
	        NotImplementedError: If ``return_dict`` is falsy (including ``None``).
	        RuntimeError: If both ``labels`` and ``input_ids`` are ``None``.
	    """
	    # Validate arguments before the (expensive) forward pass so misuse fails fast.
	    if not return_dict:
	        raise NotImplementedError("forward_with_torch_backend has to return_dict")

	    targets = labels if labels is not None else input_ids
	    if targets is None:
	        raise RuntimeError("To use forward_with_torch_backend, either labels or input_ids must be provided.")

	    from verl.utils.experimental.torch_functional import FusedLinearForPPO

	    # ``return_dict`` is forwarded explicitly: the attribute access on
	    # ``outputs`` below requires a dict-style output even when the model
	    # config defaults to tuple outputs.
	    outputs = forward_base_model(
	        self,
	        input_ids=input_ids,
	        attention_mask=attention_mask,
	        position_ids=position_ids,
	        past_key_values=past_key_values,
	        inputs_embeds=inputs_embeds,
	        use_cache=use_cache,
	        output_attentions=output_attentions,
	        output_hidden_states=output_hidden_states,
	        return_dict=return_dict,
	        cache_position=cache_position,
	    )

	    hidden_states = outputs[0]

	    # Shift targets left by one along the last dim so position i is paired
	    # with token i + 1. ``torch.roll`` wraps, so the final position receives
	    # the first token. NOTE(review): callers presumably mask that last
	    # position -- confirm.
	    rolled_labels = torch.roll(targets, shifts=-1, dims=-1)

	    fused_linear_for_ppo = FusedLinearForPPO()
	    log_probs, entropy = fused_linear_for_ppo.forward(
	        hidden_states=hidden_states,
	        vocab_weights=self.lm_head.weight,
	        input_ids=rolled_labels,
	        temperature=temperature,
	    )

	    return CausalLMOutputForPPO(
	        log_probs=log_probs,
	        entropy=entropy,
	        past_key_values=outputs.past_key_values,
	        hidden_states=outputs.hidden_states,
	        attentions=outputs.attentions,
	    )


	def forward_with_triton_backend(
	    self,
	    input_ids: torch.LongTensor = None,
	    attention_mask: Optional[torch.Tensor] = None,
	    position_ids: Optional[torch.LongTensor] = None,
	    past_key_values: Optional[Union["Cache", list[torch.FloatTensor]]] = None,
	    inputs_embeds: Optional[torch.FloatTensor] = None,
	    labels: Optional[torch.LongTensor] = None,
	    use_cache: Optional[bool] = None,
	    output_attentions: Optional[bool] = None,
	    output_hidden_states: Optional[bool] = None,
	    return_dict: Optional[bool] = None,
	    cache_position: Optional[torch.LongTensor] = None,
	    logits_to_keep: int | torch.Tensor = 0,
	    temperature: float = 1.0,
	    **loss_kwargs,
	) -> tuple | CausalLMOutputForPPO:
	    """Run the base model, then compute log-probs and entropy via ``linear_cross_entropy``.

	    The last hidden states (``outputs[0]``) and ``self.lm_head.weight`` are
	    handed to ``verl.utils.kernel.linear_cross_entropy.linear_cross_entropy``
	    together with the targets and ``temperature``.

	    Args:
	        input_ids, attention_mask, position_ids, past_key_values,
	        inputs_embeds, use_cache, output_attentions, output_hidden_states,
	        return_dict, cache_position: Forwarded to ``forward_base_model``.
	        labels: Target token ids. When ``None``, ``input_ids`` is used instead.
	        return_dict: Must be truthy; tuple outputs are not implemented.
	        logits_to_keep: Accepted for signature compatibility; not used here.
	        temperature: Passed through to ``linear_cross_entropy``.
	        **loss_kwargs: Accepted for signature compatibility; not used here.

	    Returns:
	        A ``CausalLMOutputForPPO`` with ``log_probs`` and ``entropy`` set, plus
	        ``past_key_values``, ``hidden_states`` and ``attentions`` copied from
	        the base model output. ``loss`` and ``logits`` are not set.

	    Raises:
	        NotImplementedError: If ``return_dict`` is falsy (including ``None``).
	        RuntimeError: If both ``labels`` and ``input_ids`` are ``None``.
	    """
	    # Validate arguments before the (expensive) forward pass so misuse fails fast.
	    if not return_dict:
	        raise NotImplementedError("forward_with_triton_backend has to return_dict")

	    targets = labels if labels is not None else input_ids
	    if targets is None:
	        raise RuntimeError("To use forward_with_triton_backend, either labels or input_ids must be provided.")

	    from verl.utils.kernel.linear_cross_entropy import linear_cross_entropy

	    outputs = forward_base_model(
	        self,
	        input_ids=input_ids,
	        attention_mask=attention_mask,
	        position_ids=position_ids,
	        past_key_values=past_key_values,
	        inputs_embeds=inputs_embeds,
	        use_cache=use_cache,
	        output_attentions=output_attentions,
	        output_hidden_states=output_hidden_states,
	        return_dict=return_dict,
	        cache_position=cache_position,
	    )

	    hidden_states = outputs[0]

	    # Shift targets left by one along the last dim so position i is paired
	    # with token i + 1. ``torch.roll`` wraps, so the final position receives
	    # the first token. NOTE(review): callers presumably mask that last
	    # position -- confirm.
	    rolled_labels = torch.roll(targets, shifts=-1, dims=-1)

	    # Positional call; the trailing "none" is presumably the reduction mode
	    # (no reduction over tokens) -- TODO confirm against the kernel signature.
	    log_probs, entropy = linear_cross_entropy(
	        hidden_states,
	        self.lm_head.weight,
	        rolled_labels,
	        temperature,
	        "none",
	    )

	    return CausalLMOutputForPPO(
	        log_probs=log_probs,
	        entropy=entropy,
	        past_key_values=outputs.past_key_values,
	        hidden_states=outputs.hidden_states,
	        attentions=outputs.attentions,
	    )