Caiyun-AI/MUDDFormer-2.8B

@@ -30,13 +30,14 @@ import os
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 device = torch.device('cuda:0')
 MAX_BATCH_SIZE = 1
 MAX_SEQ_LENGTH = 2048
 NUM_TOKENS_TO_GENERATE = 10
 COMPILE = True
-OPTIMIZED_COMPPILE = False
-if OPTIMIZED_COMPPILE:
     import torch._dynamo.config
     import torch._inductor.config
     torch._dynamo.config.cache_size_limit = 64
@@ -47,9 +48,9 @@ if OPTIMIZED_COMPPILE:
 tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
 model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
-_ = model.to(device=device,dtype=torch.bfloat16)
 with torch.device(device):
-    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH)
 def decode_one_token(model, cur_token, input_pos):
     logits = model(cur_token, input_pos=input_pos, return_tensor=True)

 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 device = torch.device('cuda:0')
+dtype = torch.bfloat16
 MAX_BATCH_SIZE = 1
 MAX_SEQ_LENGTH = 2048
 NUM_TOKENS_TO_GENERATE = 10
 COMPILE = True
+OPTIMIZED_COMPILE = False
+if OPTIMIZED_COMPILE:
     import torch._dynamo.config
     import torch._inductor.config
     torch._dynamo.config.cache_size_limit = 64
 tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
 model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
+_ = model.to(device=device,dtype=dtype)
 with torch.device(device):
+    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH, dtype=dtype)
 def decode_one_token(model, cur_token, input_pos):
     logits = model(cur_token, input_pos=input_pos, return_tensor=True)

generation_demo.py CHANGED Viewed

@@ -6,13 +6,14 @@ import os
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 device = torch.device('cuda:0')
 MAX_BATCH_SIZE = 1
 MAX_SEQ_LENGTH = 2048
 NUM_TOKENS_TO_GENERATE = 10
 COMPILE = True
-OPTIMIZED_COMPPILE = False
-if OPTIMIZED_COMPPILE:
     import torch._dynamo.config
     import torch._inductor.config
     torch._dynamo.config.cache_size_limit = 64
@@ -23,9 +24,9 @@ if OPTIMIZED_COMPPILE:
 tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
 model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
-_ = model.to(device=device,dtype=torch.bfloat16)
 with torch.device(device):
-    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH)
 def decode_one_token(model, cur_token, input_pos):
     logits = model(cur_token, input_pos=input_pos, return_tensor=True)

 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 device = torch.device('cuda:0')
+dtype = torch.bfloat16
 MAX_BATCH_SIZE = 1
 MAX_SEQ_LENGTH = 2048
 NUM_TOKENS_TO_GENERATE = 10
 COMPILE = True
+OPTIMIZED_COMPILE = False
+if OPTIMIZED_COMPILE:
     import torch._dynamo.config
     import torch._inductor.config
     torch._dynamo.config.cache_size_limit = 64
 tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
 model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
+_ = model.to(device=device,dtype=dtype)
 with torch.device(device):
+    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH, dtype=dtype)
 def decode_one_token(model, cur_token, input_pos):
     logits = model(cur_token, input_pos=input_pos, return_tensor=True)

modeling_muddformer.py CHANGED Viewed

@@ -119,7 +119,7 @@ class MUDDFormer(PreTrainedModel):
         self.max_batch_size = max_batch_size
         if not self.config.is_training:
             if self.use_layer_cache:
-                self.layer_cache = LayerCache(max_batch_size, self.config.n_layer, self.config.dim)
             for b in self.layers:
                 b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype=dtype)

         self.max_batch_size = max_batch_size
         if not self.config.is_training:
             if self.use_layer_cache:
+                self.layer_cache = LayerCache(max_batch_size, self.config.n_layer, self.config.dim, dtype=dtype)
             for b in self.layers:
                 b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype=dtype)