YongganFu committed
Commit d456d2c · verified · 1 Parent(s): f203027

Update README.md

Files changed (1):
1. README.md +30 -14
README.md CHANGED
@@ -12,31 +12,47 @@ Docker path: `/lustre/fsw/portfolios/nvr/users/yongganf/docker/megatron_py25_fla
 
 ## Chat with Fast-SLM-2.7B
 
+We wrap the model in a CUDA graph for fast generation:
+
 ```
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
-repo_name = "YongganFu/Fast_SLM_2_7B"
+repo_name = "nvidia/Fast_SLM_2_7B"
+
 tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True).cuda().to(torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
+model = model.cuda().to(torch.bfloat16)
 
 
-def chat_with_model(prompt, model, tokenizer, max_length=64):
-    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
-
-    outputs = model.generate(**inputs, max_length=max_length, do_sample=False, temperature=0.7, use_cache=True)
-
-    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
-    return response
+max_new_tokens = 256
 
-print("Chat with the model (type 'exit' to quit):")
+print('Initializing generation state...')
+generation_state = model.init_cuda_graph_generation(
+    max_new_tokens=max_new_tokens,
+    batch_size=1,
+    device='cuda',
+)
+
 while True:
-    print("User:")
-    prompt = input()
+    prompt = input("User:")
     if prompt.lower() == "exit":
         break
+
+    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
+
+    print("Generating with CUDA graph acceleration...")
+    outputs = model.generate_with_cuda_graph(
+        input_ids=inputs["input_ids"],
+        generation_state=generation_state,
+        max_new_tokens=max_new_tokens,
+        temperature=0,
+        top_k=50,
+        eos_token_id=tokenizer.eos_token_id,
+        profiling=False,
+    )
 
-    response = chat_with_model(prompt, model, tokenizer)
-
-    print(f"Model: {response}")
+    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
 
+    print(f"Response: {response}")
 ```
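
A note on the technique this commit adopts: a CUDA graph records a fixed sequence of GPU kernels once and replays it with a single launch, removing the per-token CPU launch overhead that dominates small-batch autoregressive decoding. The `init_cuda_graph_generation` and `generate_with_cuda_graph` methods above are custom `trust_remote_code` APIs that ship with the checkpoint; the sketch below is not their implementation, only a minimal illustration of the generic `torch.cuda.CUDAGraph` capture-and-replay pattern such methods typically build on. The names `step_fn` and `static_input` are hypothetical stand-ins for one decode step and its pre-allocated input buffer.

```
# Minimal sketch (assumption: the generic PyTorch CUDA-graph pattern,
# not the repo's actual implementation). step_fn and static_input are
# hypothetical: one decode step and a fixed-address input buffer.
import torch

def capture_decode_step(step_fn, static_input, warmup_iters=3):
    # Warm up on a side stream so one-time allocations and autotuning
    # are not recorded into the graph.
    side = torch.cuda.Stream()
    side.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side):
        for _ in range(warmup_iters):
            static_output = step_fn(static_input)
    torch.cuda.current_stream().wait_stream(side)

    # Record one decode step. After capture, tensor addresses are frozen:
    # new data must be copied into static_input, never passed as a new tensor.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_output = step_fn(static_input)
    return graph, static_output

# Per-token replay: refill the static buffer, then relaunch the entire
# recorded kernel sequence with one CPU-side call.
#   static_input.copy_(next_token_ids)
#   graph.replay()   # static_output now holds the new logits
```

This fixed-address requirement is presumably why the new README allocates `generation_state` once, up front, for a given `max_new_tokens` and `batch_size`: the KV-cache and token buffers must stay at stable addresses for graph replay to be valid.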