Training in progress - step 1000

Browse files

Files changed (5) hide show

asr_modeling.py +4 -1
chat_template.jinja +94 -89
projectors.py +155 -136
tokenizer.json +2 -2
tokenizer_config.json +9 -7

asr_modeling.py CHANGED Viewed

@@ -419,7 +419,10 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             # Compute per-sample encoder output lengths using conv formulas
             encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
             token_counts = torch.tensor(
-                [self.projector.get_output_length(int(length.item())) for length in encoder_lengths],
                 device=audio_embeds.device,
             )

             # Compute per-sample encoder output lengths using conv formulas
             encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
             token_counts = torch.tensor(
+                [
+                    self.projector.get_output_length(int(length.item()))
+                    for length in encoder_lengths
+                ],
                 device=audio_embeds.device,
             )

chat_template.jinja CHANGED Viewed

@@ -1,89 +1,94 @@
-{%- if tools %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0].role == 'system' %}
-        {{- messages[0].content + '\n\n' }}
-    {%- endif %}
-    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-    {%- for tool in tools %}
-        {{- "\n" }}
-        {{- tool | tojson }}
-    {%- endfor %}
-    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0].role == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
-{%- for message in messages[::-1] %}
-    {%- set index = (messages|length - 1) - loop.index0 %}
-    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
-        {%- set ns.multi_step_tool = false %}
-        {%- set ns.last_query_index = index %}
-    {%- endif %}
-{%- endfor %}
-{%- for message in messages %}
-    {%- if message.content is string %}
-        {%- set content = message.content %}
-    {%- else %}
-        {%- set content = '' %}
-    {%- endif %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
-        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "assistant" %}
-        {%- set reasoning_content = '' %}
-        {%- if message.reasoning_content is string %}
-            {%- set reasoning_content = message.reasoning_content %}
-        {%- else %}
-            {%- if '</think>' in content %}
-                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
-                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
-            {%- endif %}
-        {%- endif %}
-        {%- if loop.index0 > ns.last_query_index %}
-            {%- if loop.last or (not loop.last and reasoning_content) %}
-                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
-            {%- else %}
-                {{- '<|im_start|>' + message.role + '\n' + content }}
-            {%- endif %}
-        {%- else %}
-            {{- '<|im_start|>' + message.role + '\n' + content }}
-        {%- endif %}
-        {%- if message.tool_calls %}
-            {%- for tool_call in message.tool_calls %}
-                {%- if (loop.first and content) or (not loop.first) %}
-                    {{- '\n' }}
-                {%- endif %}
-                {%- if tool_call.function %}
-                    {%- set tool_call = tool_call.function %}
-                {%- endif %}
-                {{- '<tool_call>\n{"name": "' }}
-                {{- tool_call.name }}
-                {{- '", "arguments": ' }}
-                {%- if tool_call.arguments is string %}
-                    {{- tool_call.arguments }}
-                {%- else %}
-                    {{- tool_call.arguments | tojson }}
-                {%- endif %}
-                {{- '}\n</tool_call>' }}
-            {%- endfor %}
-        {%- endif %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>user' }}
-        {%- endif %}
-        {{- '\n<tool_response>\n' }}
-        {{- content }}
-        {{- '\n</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-    {%- if true %}
-        {{- '<think>\n\n</think>\n\n' }}
-    {%- endif %}
-{%- endif %}

+{# ───── defaults ───── #}
+{%- if enable_thinking is not defined -%}
+{%- set enable_thinking = true -%}
+{%- endif -%}
+{# ───── reasoning mode ───── #}
+{%- if enable_thinking -%}
+  {%- set reasoning_mode = "/think" -%}
+{%- else -%}
+  {%- set reasoning_mode = "/no_think" -%}
+{%- endif -%}
+{# ───── header (system message) ───── #}
+{{- "<|im_start|>system\n" -}}
+{%- if messages[0].role == "system" -%}
+  {%- set system_message = messages[0].content -%}
+  {%- if "/no_think" in system_message -%}
+    {%- set reasoning_mode = "/no_think" -%}
+  {%- elif "/think" in system_message -%}
+    {%- set reasoning_mode = "/think" -%}
+  {%- endif -%}
+  {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
+{%- endif -%}
+{%- if "/system_override" in system_message -%}
+  {{- custom_instructions.replace("/system_override", "").rstrip() -}}
+  {{- "<|im_end|>\n" -}}
+{%- else -%}
+  {{- "## Metadata\n\n" -}}
+  {{- "Knowledge Cutoff Date: June 2025\n" -}}
+  {%- set today = strftime_now("%d %B %Y") -%}
+  {{- "Today Date: " ~ today ~ "\n" -}}
+  {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}
+  {{- "## Custom Instructions\n\n" -}}
+  {%- if custom_instructions -%}
+    {{- custom_instructions + "\n\n" -}}
+  {%- elif reasoning_mode == "/think" -%}
+    {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
+  {%- else -%}
+    {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
+  {%- endif -%}
+  {%- if xml_tools or python_tools or tools -%}
+    {{- "### Tools\n\n" -}}
+    {%- if xml_tools or tools -%}
+      {%- if tools -%}
+        {%- set xml_tools = tools -%}
+      {%- endif -%}
+      {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
+      {%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
+        {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
+      {%- endfor -%}
+      {%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
+      {{- xml_tool_string -}}
+    {%- endif -%}
+    {%- if python_tools -%}
+      {%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
+      {%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
+        {%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
+      {%- endfor -%}
+      {%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
+      {{- python_tool_string -}}
+    {%- endif -%}
+    {{- "\n\n" -}}
+    {{- "<|im_end|>\n" -}}
+  {%- endif -%}
+{%- endif -%}
+{# ───── main loop ───── #}
+{%- for message in messages -%}
+    {%- set content = message.content if message.content is string else "" -%}
+    {%- if message.role == "user" -%}
+        {{ "<|im_start|>" + message.role + "\n"  + content + "<|im_end|>\n" }}
+    {%- elif message.role == "assistant" -%}
+        {% generation %}
+        {%- if reasoning_mode == "/think" -%}
+            {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
+        {%- else -%}
+            {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
+        {%- endif -%}
+        {% endgeneration %}
+    {%- elif message.role == "tool" -%}
+    {{ "<|im_start|>" + "user\n"  + content + "<|im_end|>\n" }}
+    {%- endif -%}
+{%- endfor -%}
+{# ───── generation prompt ───── #}
+{%- if add_generation_prompt -%}
+    {%- if reasoning_mode == "/think" -%}
+        {{ "<|im_start|>assistant\n" }}
+    {%- else -%}
+        {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n"  }}
+    {%- endif -%}
+{%- endif -%}

projectors.py CHANGED Viewed

@@ -87,6 +87,34 @@ class SimpleAdapter(nn.Module):
         return self.fc2(self.act(self.fc1(x)))
 class MOSAProjector(nn.Module):
     """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
@@ -166,109 +194,18 @@ class MOSAProjector(nn.Module):
 # =============================================================================
-# MoE Projector (Shared Expert + Sparse Routed Experts)
 # =============================================================================
-class SharedMoEBlock(nn.Module):
-    """MoE block with Shared + Sigmoid-Routed Experts."""
-    def __init__(
-        self,
-        input_dim: int,
-        hidden_dim: int,
-        output_dim: int,
-        num_experts: int = 4,
-        top_k: int = 2,
-    ):
-        super().__init__()
-        self.num_experts = num_experts
-        self.top_k = top_k
-        self.output_dim = output_dim
-        # RMSNorm before routing
-        self.norm = LlamaRMSNorm(input_dim, eps=1e-8)
-        self.router = nn.Linear(input_dim, num_experts, bias=False)
-        nn.init.normal_(self.router.weight, mean=0.0, std=0.02)
-        self.shared_expert = SimpleAdapter(input_dim, hidden_dim, output_dim)
-        self.experts = nn.ModuleList(
-            [SimpleAdapter(input_dim, hidden_dim, output_dim) for _ in range(num_experts)]
-        )
-        self.last_router_logits = None
-        self.last_router_probs = None
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        batch_size, seq_len, dim = hidden_states.shape
-        # 1. Apply Shared Expert
-        normed_states = self.norm(hidden_states)
-        shared_out = self.shared_expert(normed_states)
-        # 2. Router Logic (Sigmoid Style)
-        flat_hidden = normed_states.view(-1, dim)
-        router_logits = self.router(flat_hidden)
-        # Sigmoid routing
-        router_probs = torch.sigmoid(router_logits)
-        self.last_router_logits = router_logits
-        self.last_router_probs = router_probs
-        # 3. Top-K Selection
-        top_k_scores, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
-        # Normalize weights
-        top_k_weights = top_k_scores / (top_k_scores.sum(dim=-1, keepdim=True) + 1e-6)
-        top_k_weights = top_k_weights.to(hidden_states.dtype)
-        # 4. Dispatch
-        routed_out = self._dispatch_experts(flat_hidden, top_k_indices, top_k_weights)
-        routed_out = routed_out.view(batch_size, seq_len, -1)
-        return shared_out + routed_out
-    def _dispatch_experts(
-        self,
-        hidden_states: torch.Tensor,
-        top_k_indices: torch.Tensor,
-        top_k_weights: torch.Tensor,
-    ) -> torch.Tensor:
-        num_tokens = hidden_states.shape[0]
-        output = torch.zeros(
-            num_tokens, self.output_dim, device=hidden_states.device, dtype=hidden_states.dtype
-        )
-        for expert_idx, expert in enumerate(self.experts):
-            expert_mask = top_k_indices == expert_idx
-            if not expert_mask.any():
-                continue
-            token_indices, slot_indices = torch.where(expert_mask)
-            expert_input = hidden_states[token_indices]
-            expert_output = expert(expert_input).to(output.dtype)
-            weights = top_k_weights[token_indices, slot_indices].unsqueeze(-1)
-            output.index_add_(0, token_indices, expert_output * weights)
-        return output
-def load_balancing_loss(router_probs: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
-    """Auxiliary loss to encourage balanced expert usage."""
-    prob_per_expert = router_probs.mean(dim=0)
-    target_mean = prob_per_expert.mean()
-    return (prob_per_expert - target_mean).square().sum() * num_experts
-def z_loss(router_logits: torch.Tensor) -> torch.Tensor:
-    """Z-loss to prevent router logits from growing too large."""
-    return torch.logsumexp(router_logits.float(), dim=-1).square().mean()
-class MoEAudioProjector(nn.Module):
-    """MoE projector with shared expert + sparse routed experts."""
     def __init__(self, config):
         """Initialize MoE projector.
@@ -279,40 +216,59 @@ class MoEAudioProjector(nn.Module):
         super().__init__()
         self.k = getattr(config, "projector_pool_stride", 4)
-        encoder_dim = config.encoder_dim
-        # Depthwise Conv for temporal mixing
-        self.temporal_conv = nn.Conv1d(
-            encoder_dim, encoder_dim, kernel_size=3, padding=1, groups=encoder_dim
-        )
-        in_dim = encoder_dim * self.k
         out_dim = config.llm_dim
-        hidden_dim = getattr(config, "projector_hidden_dim", None) or in_dim
         self.num_experts = getattr(config, "num_experts", 4)
         self.top_k = getattr(config, "num_experts_per_tok", 2)
-        self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.02)
-        self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
-        self.moe = SharedMoEBlock(in_dim, hidden_dim, out_dim, self.num_experts, self.top_k)
         self._init_weights()
     def _init_weights(self):
         with torch.no_grad():
-            nn.init.orthogonal_(self.moe.shared_expert.fc1.weight)
-            nn.init.orthogonal_(self.moe.shared_expert.fc2.weight, gain=0.5)
-            for expert in self.moe.experts:
-                nn.init.orthogonal_(expert.fc1.weight)
-                nn.init.orthogonal_(expert.fc2.weight, gain=0.01)
     def get_output_length(self, input_length: int) -> int:
-        """Calculate output sequence length given input length."""
-        # Temporal pooling with stride k
-        if input_length % self.k:
-            input_length += self.k - input_length % self.k
-        return input_length // self.k
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Project audio features using shared + sparse MoE.
@@ -323,32 +279,95 @@ class MoEAudioProjector(nn.Module):
         Returns:
             Projected features of shape [batch, out_len, llm_dim]
         """
-        batch_size, seq_len, dim = x.size()
-        target_dtype = self.moe.shared_expert.fc1.weight.dtype
-        if x.dtype != target_dtype:
-            x = x.to(target_dtype)
-        # Temporal Context Injection
-        x_ctx = x.transpose(1, 2)
-        x_ctx = self.temporal_conv(x_ctx)
-        x = x + x_ctx.transpose(1, 2)
-        if seq_len % self.k:
-            x = F.pad(x, (0, 0, 0, self.k - seq_len % self.k))
-        x = x.view(batch_size, -1, dim * self.k)
-        return self.moe(x)
-    def get_aux_loss(self) -> torch.Tensor:
-        if self.moe.last_router_logits is None:
-            return torch.tensor(0.0, device=self.moe.router.weight.device)
-        balance = load_balancing_loss(self.moe.last_router_probs, self.num_experts, self.top_k)
-        z = z_loss(self.moe.last_router_logits)
-        return self.aux_loss_coef * balance + self.z_loss_coef * z
 # =============================================================================

         return self.fc2(self.act(self.fc1(x)))
+class SwiGLU(nn.Module):
+    """SwiGLU activation with gated linear units (used in LLaMA, Mistral, etc.)."""
+    def __init__(self, dim: int, hidden_dim: int, bias: bool = False):
+        super().__init__()
+        self.w1 = nn.Linear(dim, hidden_dim, bias=bias)  # Gate
+        self.w2 = nn.Linear(dim, hidden_dim, bias=bias)  # Value
+        self.w3 = nn.Linear(hidden_dim, dim, bias=bias)  # Output
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
+class AsymmetricSwiGLU(nn.Module):
+    """SwiGLU that handles different input and output dimensions."""
+    def __init__(
+        self, in_features: int, hidden_features: int, out_features: int, bias: bool = False
+    ):
+        super().__init__()
+        self.w1 = nn.Linear(in_features, hidden_features, bias=bias)  # Gate
+        self.w2 = nn.Linear(in_features, hidden_features, bias=bias)  # Value
+        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)  # Output
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
 class MOSAProjector(nn.Module):
     """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
 # =============================================================================
+# MoE Projector (Pure PyTorch with Shared Expert)
 # =============================================================================
+class MoEAudioProjector(nn.Module):
+    """MoE projector with shared expert (DeepSeek-style), pure PyTorch implementation.
+    By default uses 4 sparse experts with top-2 routing (configurable via num_experts / num_experts_per_tok) plus a shared expert that processes all tokens.
+    No external dependencies (megablocks removed).
+    Architecture matches main branch: norm → experts(in_dim → hidden → out_dim)
+    """
     def __init__(self, config):
         """Initialize MoE projector.
         super().__init__()
         self.k = getattr(config, "projector_pool_stride", 4)
+        self.aux_coef = getattr(config, "router_aux_loss_coef", 0.01)
+        # Stability coefficients
+        self.router_z_loss_coef = getattr(
+            config, "router_z_loss_coef", 1e-4
+        )  # Prevents logit explosion
+        self.router_jitter_noise = getattr(
+            config, "router_jitter_noise", 0.01
+        )  # Prevents expert collapse
+        in_dim = config.encoder_dim * self.k
         out_dim = config.llm_dim
+        # Expert hidden dim (default = output dim)
+        hidden_dim = getattr(config, "projector_hidden_dim", None) or out_dim
+        # Number of experts and top-k selection
         self.num_experts = getattr(config, "num_experts", 4)
         self.top_k = getattr(config, "num_experts_per_tok", 2)
+        # A. Normalize stacked input (like main branch SharedMoEBlock)
+        self.norm = LlamaRMSNorm(in_dim, eps=1e-6)
+        # B. Router (operates on stacked input)
+        self.router = nn.Linear(in_dim, self.num_experts, bias=False)
+        # C. Experts: simple 2-layer MLP (same as MLPAudioProjector)
+        self.experts = nn.ModuleList(
+            [SimpleAdapter(in_dim, hidden_dim, out_dim) for _ in range(self.num_experts)]
+        )
+        # D. Shared Expert (same architecture)
+        self.shared_expert = SimpleAdapter(in_dim, hidden_dim, out_dim)
+        # E. Initialize weights for stable training
         self._init_weights()
+        self.last_aux_loss = torch.tensor(0.0)
     def _init_weights(self):
+        """Initialize weights for stable training start."""
         with torch.no_grad():
+            # Router: small weights -> uniform probability
+            nn.init.normal_(self.router.weight, mean=0.0, std=0.02)
+            # Experts: xavier for fc1, small for fc2 (output)
+            for expert in [self.shared_expert, *self.experts]:
+                nn.init.xavier_uniform_(expert.fc1.weight)
+                nn.init.normal_(expert.fc2.weight, mean=0.0, std=0.01)  # Small init
     def get_output_length(self, input_length: int) -> int:
+        """Calculate output sequence length given input length (matches MLP projector)."""
+        return (input_length - self.k) // self.k + 1
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Project audio features using shared + sparse MoE.
         Returns:
             Projected features of shape [batch, out_len, llm_dim]
         """
+        # 1. Frame Stacking
+        batch, seq, dim = x.shape
+        out_len = (seq - self.k) // self.k + 1
+        x = x[:, : out_len * self.k, :]
+        x = x.reshape(batch, out_len, dim * self.k)
+        # 2. Normalize stacked input (like main branch SharedMoEBlock)
+        x = self.norm(x)
+        flat_x = x.view(-1, x.size(-1))  # [tokens, in_dim]
+        # 3. Shared Expert (compute first, creates output tensor)
+        output = self.shared_expert(flat_x)
+        # 4. Sparse Experts (in-place add to shared output)
+        self.last_aux_loss = self._forward_sparse(flat_x, output)
+        return output.view(batch, out_len, -1)
+    def _forward_sparse(self, x: torch.Tensor, output: torch.Tensor) -> torch.Tensor:
+        """Stability-hardened sparse expert dispatch (in-place add to output).
+        Args:
+            x: Flattened input of shape [tokens, dim]
+            output: Output tensor to add sparse expert results into (in-place)
+        Returns:
+            Auxiliary loss tensor
+        """
+        # A. Router Logic with Jitter
+        logits = self.router(x)
+        if self.training and self.router_jitter_noise > 0:
+            # Jitter: multiply by uniform noise (1-eps, 1+eps) to shake decision boundary
+            # Prevents router from getting stuck on one expert early in training
+            noise = torch.empty_like(logits).uniform_(
+                1.0 - self.router_jitter_noise, 1.0 + self.router_jitter_noise
+            )
+            logits = logits * noise
+        # Force float32 for softmax (bf16/fp16 exponentials can overflow)
+        probs = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(x)
+        # B. Top-K Selection
+        top_k_weights, top_k_indices = torch.topk(probs, self.top_k, dim=-1)
+        # Normalize weights so they sum to 1.0
+        top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-6)
+        # C. Aux Loss + Z-Loss
+        aux_loss = torch.tensor(0.0, device=x.device)
+        if self.training:
+            # Load balancing loss (batch-size invariant)
+            prob_per_expert = probs.mean(0)  # [num_experts]
+            target = 1.0 / self.num_experts
+            balance_loss = (
+                self.aux_coef * ((prob_per_expert - target) ** 2).mean() * self.num_experts
+            )
+            # Z-loss: penalty on large logits to prevent softmax saturation
+            z_loss = self.router_z_loss_coef * torch.logsumexp(logits, dim=-1).pow(2).mean()
+            aux_loss = balance_loss + z_loss
+        # D. Dispatch Loop (in-place add to output)
+        for i, expert in enumerate(self.experts):
+            # Create boolean mask for tokens that selected Expert 'i'
+            mask = top_k_indices == i
+            if mask.any():
+                # token_idx = which tokens, k_idx = 1st or 2nd choice
+                token_idx, k_idx = torch.where(mask)
+                # Gather inputs and compute
+                expert_input = x[token_idx]
+                expert_output = expert(expert_input)
+                # Apply routing weight
+                weight = top_k_weights[token_idx, k_idx].unsqueeze(-1)
+                weighted_output = (expert_output * weight).type_as(output)
+                # Scatter back in-place (top-k indices are distinct per token, so token_idx has no duplicates for a given expert)
+                output.index_add_(0, token_idx, weighted_output)
+        return aux_loss
+    def get_aux_loss(self) -> torch.Tensor:
+        """Return the auxiliary loss (load balancing + router z-loss) stored by the last forward pass; zero outside training."""
+        return self.last_aux_loss
 # =============================================================================

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33b674fb8444e2553eae8f1b261093371920a28ef75b5c18f4deb3f9217ed0ba
-size 11422834

 version https://git-lfs.github.com/spec/v1
+oid sha256:d4aeaf198f783cbf58d8cd59812baac429ffe49147bf9648f6618de20b8d4a4c
+size 17209003

tokenizer_config.json CHANGED Viewed

@@ -1,17 +1,19 @@
 {
-  "add_prefix_space": false,
   "backend": "tokenizers",
   "bos_token": null,
-  "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
-  "errors": "replace",
   "extra_special_tokens": [
     "<audio>"
   ],
   "is_local": false,
   "model_max_length": 131072,
-  "pad_token": "<|endoftext|>",
-  "split_special_tokens": false,
-  "tokenizer_class": "Qwen2Tokenizer",
-  "unk_token": null
 }

 {
   "backend": "tokenizers",
   "bos_token": null,
+  "clean_up_tokenization_spaces": true,
   "eos_token": "<|im_end|>",
   "extra_special_tokens": [
     "<audio>"
   ],
+  "fast": false,
   "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
   "model_max_length": 131072,
+  "model_specific_special_tokens": {},
+  "pad_token": "<|finetune_right_pad_id|>",
+  "tokenizer_class": "TokenizersBackend"
 }