# Copyright 2026 Trillion Labs and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
GravityMoE model — inherits from DeepSeek V3.

GravityMoE shares the same sparse Mixture-of-Experts architecture as DeepSeek V3
(MLA attention, sigmoid routing with bias correction, shared + routed experts)
but with different model hyperparameters. All modeling logic is inherited from
the DeepSeek V3 implementation in `transformers`.
"""

from transformers.conversion_mapping import _MODEL_TO_CONVERSION_PATTERN
from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
    DeepseekV3ForCausalLM,
    DeepseekV3Model,
    DeepseekV3PreTrainedModel,
)

from .configuration_gravity_moe import GravityMoEConfig


# Import-time side effect: register the "gravity_moe" model type under the
# "qwen2_moe" conversion pattern. The intent is that `from_pretrained` fuses
# per-expert checkpoint weights (experts.*.gate_proj, etc.) into 3D tensors
# (experts.gate_up_proj, experts.down_proj), same as DeepSeek V3.
# NOTE(review): this writes into a private `transformers` table; confirm the
# "qwen2_moe" key still exists when bumping the `transformers` version.
_MODEL_TO_CONVERSION_PATTERN["gravity_moe"] = "qwen2_moe"


class GravityMoEPreTrainedModel(DeepseekV3PreTrainedModel):
    """Base pretrained-model class for GravityMoE.

    Subclasses `DeepseekV3PreTrainedModel` and only overrides class-level
    attributes; no methods are redefined here.
    """

    # Configuration class associated with this model family.
    config_class = GravityMoEConfig

    # Presumably keeps the router's score-correction bias in float32 even when
    # the rest of the model is loaded in lower precision — TODO confirm against
    # the `PreTrainedModel` loading code.
    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]

    # Checkpoint keys under layer index 28 are expected to have no counterpart
    # in the instantiated model and should not be reported as unexpected.
    # NOTE(review): the pattern has no separator after "28", so it would also
    # match indices such as 280-289; harmless unless the model grows that deep.
    _keys_to_ignore_on_load_unexpected = [r"model\.layers\.28.*"]


class GravityMoEModel(DeepseekV3Model):
    """GravityMoE model; reuses `DeepseekV3Model` with `GravityMoEConfig`."""

    config_class = GravityMoEConfig


class GravityMoEForCausalLM(DeepseekV3ForCausalLM):
    """GravityMoE causal-LM class; reuses `DeepseekV3ForCausalLM` with `GravityMoEConfig`."""

    config_class = GravityMoEConfig


# Public API of this module.
__all__ = [
    "GravityMoEPreTrainedModel",
    "GravityMoEModel",
    "GravityMoEForCausalLM",
]