Commit
·
ad52ebd
1
Parent(s):
b85543c
fix typo & dtype
Browse files
- README.md +5 -4
- generation_demo.py +5 -4
- modeling_muddformer.py +1 -1
README.md
CHANGED
|
@@ -30,13 +30,14 @@ import os
|
|
| 30 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 31 |
|
| 32 |
device = torch.device('cuda:0')
|
|
|
|
| 33 |
MAX_BATCH_SIZE = 1
|
| 34 |
MAX_SEQ_LENGTH = 2048
|
| 35 |
NUM_TOKENS_TO_GENERATE = 10
|
| 36 |
COMPILE = True
|
| 37 |
-
|
| 38 |
|
| 39 |
-
if OPTIMIZED_COMPPILE:
|
| 40 |
import torch._dynamo.config
|
| 41 |
import torch._inductor.config
|
| 42 |
torch._dynamo.config.cache_size_limit = 64
|
|
@@ -47,9 +48,9 @@ if OPTIMIZED_COMPPILE:
|
|
| 47 |
tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
|
| 48 |
model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
|
| 49 |
|
| 50 |
-
_ = model.to(device=device,dtype=
|
| 51 |
with torch.device(device):
|
| 52 |
-
model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH)
|
| 53 |
|
| 54 |
def decode_one_token(model, cur_token, input_pos):
|
| 55 |
logits = model(cur_token, input_pos=input_pos, return_tensor=True)
|
|
|
|
| 30 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 31 |
|
| 32 |
device = torch.device('cuda:0')
|
| 33 |
+
dtype = torch.bfloat16
|
| 34 |
MAX_BATCH_SIZE = 1
|
| 35 |
MAX_SEQ_LENGTH = 2048
|
| 36 |
NUM_TOKENS_TO_GENERATE = 10
|
| 37 |
COMPILE = True
|
| 38 |
+
OPTIMIZED_COMPILE = False
|
| 39 |
|
| 40 |
+
if OPTIMIZED_COMPILE:
|
| 41 |
import torch._dynamo.config
|
| 42 |
import torch._inductor.config
|
| 43 |
torch._dynamo.config.cache_size_limit = 64
|
|
|
|
| 48 |
tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
|
| 49 |
model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
|
| 50 |
|
| 51 |
+
_ = model.to(device=device,dtype=dtype)
|
| 52 |
with torch.device(device):
|
| 53 |
+
model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH, dtype=dtype)
|
| 54 |
|
| 55 |
def decode_one_token(model, cur_token, input_pos):
|
| 56 |
logits = model(cur_token, input_pos=input_pos, return_tensor=True)
|
generation_demo.py
CHANGED
|
@@ -6,13 +6,14 @@ import os
|
|
| 6 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 7 |
|
| 8 |
device = torch.device('cuda:0')
|
|
|
|
| 9 |
MAX_BATCH_SIZE = 1
|
| 10 |
MAX_SEQ_LENGTH = 2048
|
| 11 |
NUM_TOKENS_TO_GENERATE = 10
|
| 12 |
COMPILE = True
|
| 13 |
-
|
| 14 |
|
| 15 |
-
if OPTIMIZED_COMPPILE:
|
| 16 |
import torch._dynamo.config
|
| 17 |
import torch._inductor.config
|
| 18 |
torch._dynamo.config.cache_size_limit = 64
|
|
@@ -23,9 +24,9 @@ if OPTIMIZED_COMPPILE:
|
|
| 23 |
tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
|
| 24 |
model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
|
| 25 |
|
| 26 |
-
_ = model.to(device=device,dtype=
|
| 27 |
with torch.device(device):
|
| 28 |
-
model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH)
|
| 29 |
|
| 30 |
def decode_one_token(model, cur_token, input_pos):
|
| 31 |
logits = model(cur_token, input_pos=input_pos, return_tensor=True)
|
|
|
|
| 6 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
| 7 |
|
| 8 |
device = torch.device('cuda:0')
|
| 9 |
+
dtype = torch.bfloat16
|
| 10 |
MAX_BATCH_SIZE = 1
|
| 11 |
MAX_SEQ_LENGTH = 2048
|
| 12 |
NUM_TOKENS_TO_GENERATE = 10
|
| 13 |
COMPILE = True
|
| 14 |
+
OPTIMIZED_COMPILE = False
|
| 15 |
|
| 16 |
+
if OPTIMIZED_COMPILE:
|
| 17 |
import torch._dynamo.config
|
| 18 |
import torch._inductor.config
|
| 19 |
torch._dynamo.config.cache_size_limit = 64
|
|
|
|
| 24 |
tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
|
| 25 |
model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
|
| 26 |
|
| 27 |
+
_ = model.to(device=device,dtype=dtype)
|
| 28 |
with torch.device(device):
|
| 29 |
+
model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH, dtype=dtype)
|
| 30 |
|
| 31 |
def decode_one_token(model, cur_token, input_pos):
|
| 32 |
logits = model(cur_token, input_pos=input_pos, return_tensor=True)
|
modeling_muddformer.py
CHANGED
|
@@ -119,7 +119,7 @@ class MUDDFormer(PreTrainedModel):
|
|
| 119 |
self.max_batch_size = max_batch_size
|
| 120 |
if not self.config.is_training:
|
| 121 |
if self.use_layer_cache:
|
| 122 |
-
self.layer_cache = LayerCache(max_batch_size, self.config.n_layer, self.config.dim)
|
| 123 |
for b in self.layers:
|
| 124 |
b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype=dtype)
|
| 125 |
|
|
|
|
| 119 |
self.max_batch_size = max_batch_size
|
| 120 |
if not self.config.is_training:
|
| 121 |
if self.use_layer_cache:
|
| 122 |
+
self.layer_cache = LayerCache(max_batch_size, self.config.n_layer, self.config.dim, dtype=dtype)
|
| 123 |
for b in self.layers:
|
| 124 |
b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype=dtype)
|
| 125 |
|