Update modeling_sdar.py (#2)
Commit fb3dd55ee5da84e0c564a5201f2ca224a7a7dedd

modeling_sdar.py  +4 -11
@@ -1,11 +1,8 @@
+# SPDX-License-Identifier: MIT
+# Adapted from https://huggingface.co/Gen-Verse/TraDo-8B-Instruct/blob/main/modeling_sdar.py
 # This file is modified based on https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen3/modeling_qwen3.py.
 #
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from src/transformers/models/qwen3/modular_qwen3.py.
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_qwen3.py file directly. One of our CI enforces this.
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+
 # coding=utf-8
 # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
 #

@@ -49,7 +46,6 @@ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import (
-    LossKwargs,
     auto_docstring,
     can_return_tuple,
     is_torch_flex_attn_available,

@@ -781,9 +777,6 @@ class SDARModel(SDARPreTrainedModel):
         return causal_mask


-class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
-
-
 @auto_docstring
 class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

@@ -832,7 +825,7 @@ class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
+        **kwargs: dict,
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):