| """ |
| GravityMoE model — inherits from DeepSeek V3. |
| |
| GravityMoE shares the same sparse Mixture-of-Experts architecture as DeepSeek V3 |
| (MLA attention, sigmoid routing with bias correction, shared + routed experts) |
| but with different model hyperparameters. All modeling logic is inherited from |
| the DeepSeek V3 implementation in `transformers`. |
| """ |

from transformers.conversion_mapping import _MODEL_TO_CONVERSION_PATTERN
from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
    DeepseekV3ForCausalLM,
    DeepseekV3Model,
    DeepseekV3PreTrainedModel,
)

from .configuration_gravity_moe import GravityMoEConfig

# All modeling code comes from DeepSeek V3, so reuse its checkpoint-conversion
# pattern when loading older-format weights.
_MODEL_TO_CONVERSION_PATTERN["gravity_moe"] = "deepseek_v3"


class GravityMoEPreTrainedModel(DeepseekV3PreTrainedModel):
    config_class = GravityMoEConfig
    # The router's expert-score correction bias is numerically sensitive, so
    # keep it in fp32 even when the rest of the model runs in half precision.
    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]
    # Checkpoints may carry an extra trailing layer at index 28 (e.g. a
    # DeepSeek-style multi-token-prediction head) that the causal LM never
    # instantiates; drop those weights instead of warning on load.
    _keys_to_ignore_on_load_unexpected = [r"model\.layers\.28.*"]


class GravityMoEModel(DeepseekV3Model):
    config_class = GravityMoEConfig


class GravityMoEForCausalLM(DeepseekV3ForCausalLM):
    config_class = GravityMoEConfig


__all__ = [
    "GravityMoEPreTrainedModel",
    "GravityMoEModel",
    "GravityMoEForCausalLM",
]
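
# Minimal usage sketch, assuming GravityMoEConfig exposes the standard DeepSeek
# V3 hyperparameters; the values below are illustrative placeholders, not the
# released GravityMoE configuration:
#
#     config = GravityMoEConfig(num_hidden_layers=28, n_routed_experts=64)
#     model = GravityMoEForCausalLM(config)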