Update README.md
Browse files
README.md
CHANGED
|
@@ -180,8 +180,8 @@ def chat_loop(model, tokenizer):
|
|
| 180 |
input_ids=input_ids,
|
| 181 |
max_new_tokens=max_new_tokens,
|
| 182 |
do_sample=True,
|
| 183 |
-
temperature=0
|
| 184 |
-
top_p=0.
|
| 185 |
top_k=40,
|
| 186 |
use_cache=False,
|
| 187 |
pad_token_id=tokenizer.pad_token_id,
|
|
@@ -222,15 +222,15 @@ layer_config = {}
|
|
| 222 |
for i in range(48): # 48 layers
|
| 223 |
prefix = f"model.layers.{i}"
|
| 224 |
|
| 225 |
-
# Attention layers ->
|
| 226 |
if i in [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]: # self_attn layers
|
| 227 |
for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]:
|
| 228 |
layer_config[f"{prefix}.self_attn.{proj}"] = {"bits": 16}
|
| 229 |
-
else: # linear_attn layers
|
| 230 |
for proj in ["in_proj_qkvz", "in_proj_ba", "out_proj"]:
|
| 231 |
layer_config[f"{prefix}.linear_attn.{proj}"] = {"bits": 16}
|
| 232 |
|
| 233 |
-
# MLP gate ->
|
| 234 |
layer_config[f"{prefix}.mlp.gate"] = {"bits": 16}
|
| 235 |
|
| 236 |
# shared_expert_gate -> 16-bit (skipped)
|
|
|
|
| 180 |
input_ids=input_ids,
|
| 181 |
max_new_tokens=max_new_tokens,
|
| 182 |
do_sample=True,
|
| 183 |
+
temperature=1.0,
|
| 184 |
+
top_p=0.95,
|
| 185 |
top_k=40,
|
| 186 |
use_cache=False,
|
| 187 |
pad_token_id=tokenizer.pad_token_id,
|
|
|
|
| 222 |
for i in range(48): # 48 layers
|
| 223 |
prefix = f"model.layers.{i}"
|
| 224 |
|
| 225 |
+
# Attention layers -> 16-bit
|
| 226 |
if i in [3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47]: # self_attn layers
|
| 227 |
for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]:
|
| 228 |
layer_config[f"{prefix}.self_attn.{proj}"] = {"bits": 16}
|
| 229 |
+
else: # linear_attn layers -> 16-bit
|
| 230 |
for proj in ["in_proj_qkvz", "in_proj_ba", "out_proj"]:
|
| 231 |
layer_config[f"{prefix}.linear_attn.{proj}"] = {"bits": 16}
|
| 232 |
|
| 233 |
+
# MLP gate -> 16-bit
|
| 234 |
layer_config[f"{prefix}.mlp.gate"] = {"bits": 16}
|
| 235 |
|
| 236 |
# shared_expert_gate -> 16-bit (skipped)
|