Hilbertmeng committed on
Commit b85543c · 1 Parent(s): c69b327

update README

Files changed (2):
1. README.md +71 -0
2. generation_demo.py +2 -2
README.md CHANGED
@@ -1,3 +1,74 @@
  ---
+ language:
+ - en
+ tags:
+ - pytorch
+ - causal-lm
+ - muddformer
  license: mit
  ---
+ MUDDFormer-2.8B is a language model pretrained on the Pile for 300B tokens. It uses a simple yet effective method to address the limitations of residual connections and enhance cross-layer information flow in Transformers. Please see downstream evaluations and more details in the paper [MUDDFormer: Breaking Residual Bottlenecks in Transformers via Multiway Dynamic Dense Connections](https://arxiv.org). In addition, we open-source the JAX training code on [GitHub](https://github.com/Caiyun-AI/MUDDFormer/).
+
+ We recommend the <strong>compiled version</strong> of MUDDFormer with *torch.compile* for inference acceleration; please refer to the Generation section for the compile implementation.
+
+ # Usage
+
+ ## Env
+
+ ```
+ pip install transformers==4.35.0 torch==2.5.1
+ ```
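+
+ As a quick sanity check of the environment (optional; assumes a CUDA-capable GPU, which the Generation section below requires):
+
+ ```
+ import torch, transformers
+ print(transformers.__version__)   # expect 4.35.0
+ print(torch.__version__)          # expect 2.5.1
+ print(torch.cuda.is_available())  # generation below targets cuda:0
+ ```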
+
+ ## Generation
+
+ ```
+ import time
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ import os
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+ device = torch.device('cuda:0')
+ MAX_BATCH_SIZE = 1
+ MAX_SEQ_LENGTH = 2048
+ NUM_TOKENS_TO_GENERATE = 10
+ COMPILE = True
+ OPTIMIZED_COMPILE = False  # optional extra dynamo/inductor tuning flags
+
+ if OPTIMIZED_COMPILE:
+     import torch._dynamo.config
+     import torch._inductor.config
+     torch._dynamo.config.cache_size_limit = 64
+     torch._inductor.config.coordinate_descent_tuning = True
+     torch._inductor.config.triton.unique_kernel_names = True
+     torch._inductor.config.fx_graph_cache = True
+
+ tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
+ model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)
+
+ _ = model.to(device=device, dtype=torch.bfloat16)
+ with torch.device(device):
+     model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH)
+
+ # Greedy one-step decode; this is the function handed to torch.compile.
+ def decode_one_token(model, cur_token, input_pos):
+     logits = model(cur_token, input_pos=input_pos, return_tensor=True)
+     new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
+     return new_token
+
+ prompt = "Beijing is the capital of China. London is the capital of"
+ input_ids = tokenizer.encode(prompt, return_tensors='pt')
+
+ compiled_decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True) if COMPILE else None
+
+ print('Start generating tokens; the first iteration takes a few minutes to compile.')
+ for i in range(10):
+     t0 = time.time()
+     with torch.no_grad():
+         generated_ids = model.generate(input_ids.to(device), num_tokens_to_generate=NUM_TOKENS_TO_GENERATE, compiled_decode_one_token=compiled_decode_one_token)
+     text = tokenizer.decode(generated_ids[0])
+     if i == 0:
+         print(f'Generated text: {text}')
+     t1 = time.time()
+     print(f'Time consumed at iteration {i}: {t1-t0:.2f}s')
+ ```
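+
+ Setting COMPILE = False above passes compiled_decode_one_token=None, which (as the script's COMPILE flag suggests) runs generation without torch.compile: slower per token, but no warm-up cost. A minimal sketch reusing the objects created above:
+
+ ```
+ # Eager (non-compiled) generation; handy for a quick smoke test.
+ generated_ids = model.generate(input_ids.to(device), num_tokens_to_generate=NUM_TOKENS_TO_GENERATE, compiled_decode_one_token=None)
+ print(tokenizer.decode(generated_ids[0]))
+ ```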
generation_demo.py CHANGED
@@ -10,9 +10,9 @@ MAX_BATCH_SIZE = 1
  MAX_SEQ_LENGTH = 2048
  NUM_TOKENS_TO_GENERATE = 10
  COMPILE = True
- OPTIMAZED_COMPPILE = False
+ OPTIMIZED_COMPILE = False

- if OPTIMAZED_COMPPILE:
+ if OPTIMIZED_COMPILE:
      import torch._dynamo.config
      import torch._inductor.config
      torch._dynamo.config.cache_size_limit = 64