genai2eliza committed
Commit cda4bbd · verified · 1 Parent(s): 161ba5b

Upload Llama-3.1-8B quantized with ModelOpt FP8

Files changed (1)
  1. README.md +3 -9
README.md CHANGED
@@ -26,23 +26,17 @@ This is a quantized version of [meta-llama/Llama-3.1-8B-Instruct](https://huggin
  ```python
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer
- from quanto import quantize, freeze, qint8, safe_load

  # Load base model structure
  model = AutoModelForCausalLM.from_pretrained(
-     "meta-llama/Llama-3.1-8B-Instruct",
+     "tokenlabsdotrun/Llama-3.1-8B-ModelOpt-FP8",
      torch_dtype=torch.bfloat16,
      low_cpu_mem_usage=True
  )

- # Quantize structure and load weights
- quantize(model, weights=qint8)
- state_dict = safe_load("model.safetensors")  # Use quanto's safe_load
- model.load_state_dict(state_dict)
- freeze(model)
-
  # Load tokenizer and generate
- tokenizer = AutoTokenizer.from_pretrained("tokenlabsdotrun/Llama-3.1-8B-Quanto-Int8")
+ tokenizer = AutoTokenizer.from_pretrained("tokenlabsdotrun/Llama-3.1-8B-ModelOpt-FP8")
+
  inputs = tokenizer("Hello, my name is", return_tensors="pt")
  outputs = model.generate(**inputs, max_new_tokens=10)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  ```
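The commit message says the checkpoint was quantized with ModelOpt FP8, while the updated README only shows how to load it. As a rough sketch of how such a checkpoint might be produced with NVIDIA's TensorRT Model Optimizer (`modelopt`) — the calibration prompts, the `FP8_DEFAULT_CFG` recipe, and the overall flow below are assumptions for illustration, not taken from this repository:

```python
# Hypothetical sketch: producing an FP8-quantized Llama-3.1-8B with
# NVIDIA TensorRT Model Optimizer. Details here are illustrative assumptions.
import torch
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ModelOpt calibrates activation scales by running the model over
# representative data inside a user-supplied forward loop.
calib_prompts = ["Hello, my name is", "The capital of France is"]  # illustrative

def forward_loop(m):
    for prompt in calib_prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        m(**inputs)

# Apply ModelOpt's default FP8 recipe (FP8 weights and activations).
model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)
```

After quantization, the model and its FP8 scales would then be exported (for example via `modelopt.torch.export` or `save_pretrained`), which is presumably the step that produced the checkpoint uploaded in this commit.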