Pull updates from branch 'main' of https://huggingface.co/Qwen/Qwen-7B-Chat-Int4.

Files changed (3)

README.md CHANGED

@@ -18,7 +18,7 @@ inference: false
 <p align="center">
         🤗 <a href="https://huggingface.co/Qwen">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/organization/qwen">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp 📑 <a href="https://arxiv.org/abs/2309.16609">Paper</a> &nbsp&nbsp ｜ &nbsp&nbsp🖥️ <a href="https://modelscope.cn/studios/qwen/Qwen-7B-Chat-Demo/summary">Demo</a>
 <br>
-<a href="assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp<a href="https://discord.gg/z3GAxXZ9Ce">Discord</a>&nbsp&nbsp ｜  &nbsp&nbsp<a href="https://dashscope.aliyun.com">API</a>
 </p>
 <br>

 <p align="center">
         🤗 <a href="https://huggingface.co/Qwen">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/organization/qwen">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp 📑 <a href="https://arxiv.org/abs/2309.16609">Paper</a> &nbsp&nbsp ｜ &nbsp&nbsp🖥️ <a href="https://modelscope.cn/studios/qwen/Qwen-7B-Chat-Demo/summary">Demo</a>
 <br>
+<a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp<a href="https://discord.gg/z3GAxXZ9Ce">Discord</a>&nbsp&nbsp ｜  &nbsp&nbsp<a href="https://dashscope.aliyun.com">API</a>
 </p>
 <br>

assets/wechat.png CHANGED

modeling_qwen.py CHANGED

@@ -540,9 +540,7 @@ class QWenAttention(nn.Module):
             if not self.use_cache_quantization and SUPPORT_TORCH2:
                 if attention_mask is not None:
-                    attention_mask = attention_mask.expand(
-                        -1, -1, causal_mask.size(2), -1
-                    )
                     if causal_mask is not None:
                         attention_mask = attention_mask.masked_fill(~causal_mask, torch.finfo(query.dtype).min)
                 else:
@@ -1356,7 +1354,7 @@ def apply_rotary_pos_emb(t, freqs):
       t (tensor(batch_size, seq_len, n_head, head_dim)):
         the input embedding/hidden states
       freqs (list[tensor(1, seq_len, 1, rotary_dim), tensor(1, seq_len, 1, rotary_dim)]):
-        the cached cos/sin position embeddings
     """
     rot_dim = freqs[0].shape[-1]
     cos, sin = freqs

             if not self.use_cache_quantization and SUPPORT_TORCH2:
                 if attention_mask is not None:
+                    attention_mask = attention_mask.expand(-1, -1, query.size(2), -1)
                     if causal_mask is not None:
                         attention_mask = attention_mask.masked_fill(~causal_mask, torch.finfo(query.dtype).min)
                 else:
       t (tensor(batch_size, seq_len, n_head, head_dim)):
         the input embedding/hidden states
       freqs (list[tensor(1, seq_len, 1, rotary_dim), tensor(1, seq_len, 1, rotary_dim)]):
+        the cached cos/sin position embeddings
     """
     rot_dim = freqs[0].shape[-1]
     cos, sin = freqs