| """ |
| GravityMoE model — inherits from DeepSeek V3. |
| |
| GravityMoE shares the same sparse Mixture-of-Experts architecture as DeepSeek V3 |
| (MLA attention, sigmoid routing with bias correction, shared + routed experts) |
| but with different model hyperparameters. All modeling logic is inherited from |
| the DeepSeek V3 implementation in `transformers`. |
| """ |

from transformers.conversion_mapping import _MODEL_TO_CONVERSION_PATTERN
from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
    DeepseekV3ForCausalLM,
    DeepseekV3Model,
    DeepseekV3PreTrainedModel,
)

from .configuration_gravity_moe import GravityMoEConfig

# All modeling code comes from DeepSeek V3, so reuse its checkpoint-conversion
# pattern when loading older-format weights.
_MODEL_TO_CONVERSION_PATTERN["gravity_moe"] = "deepseek_v3"


class GravityMoEPreTrainedModel(DeepseekV3PreTrainedModel):
    config_class = GravityMoEConfig
    # The router's expert-score correction bias is numerically sensitive, so
    # keep it in fp32 even when the rest of the model runs in half precision.
    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]
    # Checkpoints may carry an extra trailing layer at index 28 (e.g. a
    # DeepSeek-style multi-token-prediction head) that the causal LM never
    # instantiates; drop those weights instead of warning on load.
    _keys_to_ignore_on_load_unexpected = [r"model\.layers\.28.*"]


class GravityMoEModel(DeepseekV3Model):
    config_class = GravityMoEConfig


class GravityMoEForCausalLM(DeepseekV3ForCausalLM):
    config_class = GravityMoEConfig


__all__ = [
    "GravityMoEPreTrainedModel",
    "GravityMoEModel",
    "GravityMoEForCausalLM",
]
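
# Minimal usage sketch, assuming GravityMoEConfig exposes the standard DeepSeek
# V3 hyperparameters; the values below are illustrative placeholders, not the
# released GravityMoE configuration:
#
#     config = GravityMoEConfig(num_hidden_layers=28, n_routed_experts=64)
#     model = GravityMoEForCausalLM(config)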