Update README.md
README.md CHANGED

@@ -12,31 +12,47 @@ Docker path: `/lustre/fsw/portfolios/nvr/users/yongganf/docker/megatron_py25_fla

Before:

## Chat with Fast-SLM-2.7B
```
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

repo_name = "
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)

max_length = 512

print("Chat with the model (type 'exit' to quit):")
while True:
    prompt = input()
    if prompt.lower() == "exit":
        break
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs, max_length=max_length, do_sample=False, temperature=0.7, use_cache=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Model: {response}")
```

After:

## Chat with Fast-SLM-2.7B

We wrap the model in a CUDA Graph for fast generation:

```
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

repo_name = "nvidia/Fast_SLM_2_7B"

tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
model = model.cuda().to(torch.bfloat16)

max_new_tokens = 256

# Build the generation state once; it is reused across prompts so the
# captured CUDA graph can be replayed for every request.
print('Initializing generation state...')
generation_state = model.init_cuda_graph_generation(
    max_new_tokens=max_new_tokens,
    batch_size=1,
    device='cuda',
)

while True:
    prompt = input("User: ")
    if prompt.lower() == "exit":
        break

    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

    print("Generating with CUDA graph acceleration...")
    outputs = model.generate_with_cuda_graph(
        input_ids=inputs["input_ids"],
        generation_state=generation_state,
        max_new_tokens=max_new_tokens,
        temperature=0,  # temperature 0 selects greedy decoding
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        profiling=False,
    )

    # Drop the prompt tokens so only the newly generated text is printed.
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    print(f"Response: {response}")
```
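
Under the hood, a CUDA Graph records the sequence of GPU kernels launched during one decode step and replays them as a single unit, removing the per-kernel CPU launch overhead that dominates batch-size-1 decoding. Below is a minimal sketch of the general capture-and-replay pattern using PyTorch's public `torch.cuda.CUDAGraph` API; it illustrates the technique only and is not the implementation behind `init_cuda_graph_generation` or `generate_with_cuda_graph`:

```
import torch

# Stand-in "decode step": one linear layer in place of the full model forward.
layer = torch.nn.Linear(1024, 1024, device='cuda', dtype=torch.bfloat16)
static_input = torch.zeros(1, 1024, device='cuda', dtype=torch.bfloat16)

with torch.no_grad():
    # Warm up on a side stream so one-time kernel setup is not captured.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            static_output = layer(static_input)
    torch.cuda.current_stream().wait_stream(s)

    # Capture: kernels launched inside this block are recorded, not executed.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_output = layer(static_input)

# Replay: refill the static input buffer in place, then relaunch the entire
# recorded kernel sequence with a single call.
static_input.copy_(torch.randn(1, 1024, device='cuda', dtype=torch.bfloat16))
graph.replay()
print(static_output.float().norm())  # static_output now holds the new result
```

The chat snippet above presumably follows the same recipe at model scale: fixed-shape input and cache buffers are allocated once in `generation_state`, and each new token is produced by a single graph replay rather than one launch per kernel.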