jiang-cc
/

AD-Copilot-Thinking

@@ -230,6 +230,13 @@ class OptimizedCrossAttention(nn.Module):
         if hasattr(self.config, '_attn_implementation') and self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
         # 执行 attention 计算
         attn_output, _ = attention_interface(
             self,
@@ -237,13 +244,18 @@ class OptimizedCrossAttention(nn.Module):
             k,
             v,
             attention_mask=attention_mask,
             dropout=0.0 if not self.training else self.attention_dropout,
             scaling=self.scaling,
             is_causal=False,
             **kwargs,
         )
-        # 重塑输出
         attn_output = attn_output.transpose(1, 2).contiguous()  # [batch_size, seq_len_q, num_heads, head_dim]
         attn_output = attn_output.reshape(batch_size, seq_len_q, self.dim)  # [batch_size, seq_len_q, hidden_size]

         if hasattr(self.config, '_attn_implementation') and self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        # 构造 cu_seqlens 参数（FlashAttention 必需）
+        cu_seqlens_q = torch.arange(0, (batch_size + 1) * seq_len_q, step=seq_len_q, dtype=torch.int32, device=q.device)
+        if self.is_cross_attention and key_value_states is not None:
+            cu_seqlens_k = torch.arange(0, (batch_size + 1) * seq_len_kv, step=seq_len_kv, dtype=torch.int32, device=k.device)
+        else:
+            cu_seqlens_k = cu_seqlens_q
         # 执行 attention 计算
         attn_output, _ = attention_interface(
             self,
             k,
             v,
             attention_mask=attention_mask,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=seq_len_q,
+            max_seqlen_k=seq_len_kv if self.is_cross_attention and key_value_states is not None else seq_len_q,
             dropout=0.0 if not self.training else self.attention_dropout,
             scaling=self.scaling,
             is_causal=False,
             **kwargs,
         )
+        attn_output = attn_output.reshape(batch_size, self.num_heads, seq_len_q, self.head_dim)
         attn_output = attn_output.transpose(1, 2).contiguous()  # [batch_size, seq_len_q, num_heads, head_dim]
         attn_output = attn_output.reshape(batch_size, seq_len_q, self.dim)  # [batch_size, seq_len_q, hidden_size]

tokenizer_config.json CHANGED Viewed

@@ -202,8 +202,12 @@
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "processor_class": "YangJianProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",

   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
+  "max_length": null,
   "model_max_length": 131072,
+  "pad_to_multiple_of": null,
   "pad_token": "<|endoftext|>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
   "processor_class": "YangJianProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",