# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from fairseq.model_parallel.modules import ModelParallelMultiheadAttention
from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer


# Megatron-LM's model-parallel utilities (mpu) live in an optional git
# submodule; degrade gracefully when it has not been checked out.
try:
    from fairseq.model_parallel.megatron.mpu import (
        ColumnParallelLinear,
        RowParallelLinear,
    )

    has_megatron_submodule = True
except (ImportError, ModuleNotFoundError):
    has_megatron_submodule = False


class ModelParallelTransformerEncoderLayer(TransformerEncoderLayer):
    """Encoder layer block parallelized over multiple GPUs.

    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
    """

    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        if q_noise > 0:
            raise NotImplementedError  # quant_noise is not supported with model parallelism
        # fc1 is column-parallel: its weight is split along the output
        # dimension, so each rank produces a shard of the FFN hidden state
        # (gather_output=False keeps those activations sharded for fc2).
        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)

    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        if q_noise > 0:
            raise NotImplementedError  # quant_noise is not supported with model parallelism
        # fc2 is row-parallel: it consumes the sharded hidden state
        # (input_is_parallel=True) and all-reduces the partial outputs back
        # into the full embedding dimension. A toy illustration of the
        # column/row split appears at the bottom of this file.
        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)

    def build_self_attention(self, embed_dim, args, **unused_kwargs):
        # Self-attention with the heads partitioned across model-parallel ranks.
        return ModelParallelMultiheadAttention(
            embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
            self_attention=True,
        )


class ModelParallelTransformerDecoderLayer(TransformerDecoderLayer):
    """Decoder layer block parallelized over multiple GPUs.

    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
    """

    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        if q_noise > 0:
            raise NotImplementedError  # quant_noise is not supported with model parallelism
        # Column-parallel fc1: activations stay sharded across ranks.
        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)

    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        if q_noise > 0:
            raise NotImplementedError  # quant_noise is not supported with model parallelism
        # Row-parallel fc2: partial outputs are all-reduced back together.
        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)

    def build_self_attention(self, embed_dim, args, **unused_kwargs):
        return ModelParallelMultiheadAttention(
            embed_dim=embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            self_attention=not getattr(args, "cross_self_attention", False),
        )

    def build_encoder_attention(self, embed_dim, args, **unused_kwargs):
        # Encoder-decoder (cross) attention, with key/value projections sized
        # to the encoder's embedding dimension.
        return ModelParallelMultiheadAttention(
            embed_dim=embed_dim,
            num_heads=args.decoder_attention_heads,
            kdim=getattr(args, "encoder_embed_dim", None),
            vdim=getattr(args, "encoder_embed_dim", None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
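

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module, and not used by
# fairseq): a single-process demo of why the FFN is built as a column-parallel
# fc1 followed by a row-parallel fc2. Splitting fc1's weight along its output
# dimension and fc2's weight along its input dimension lets each rank hold one
# shard; the per-rank fc2 outputs then sum to the full result, which Megatron
# realizes with an all-reduce. Here the "two ranks" are faked with tensor
# slicing on a single device, and all shapes are made-up toy sizes.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torch

    torch.manual_seed(0)
    x = torch.randn(4, 8)  # (tokens, embed_dim)
    w1 = torch.randn(16, 8)  # fc1 weight: embed_dim -> ffn_dim
    w2 = torch.randn(8, 16)  # fc2 weight: ffn_dim -> embed_dim

    # Reference: the unsharded feed-forward block.
    full = torch.relu(x @ w1.t()) @ w2.t()

    # "Rank 0" and "rank 1" each own half of fc1's output rows (column
    # parallel) and the matching half of fc2's input columns (row parallel).
    h0 = torch.relu(x @ w1[:8].t())
    h1 = torch.relu(x @ w1[8:].t())
    partial = h0 @ w2[:, :8].t() + h1 @ w2[:, 8:].t()  # the "all-reduce"

    assert torch.allclose(full, partial, atol=1e-5)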