Upload Llama-3.1-8B quantized with ModelOpt FP8
README.md CHANGED

````diff
@@ -26,23 +26,17 @@ This is a quantized version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from quanto import quantize, freeze, qint8, safe_load
 
 # Load base model structure
 model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
+    "tokenlabsdotrun/Llama-3.1-8B-ModelOpt-FP8",
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True
 )
 
-# Quantize structure and load weights
-quantize(model, weights=qint8)
-state_dict = safe_load("model.safetensors")  # Use quanto's safe_load
-model.load_state_dict(state_dict)
-freeze(model)
-
 # Load tokenizer and generate
-tokenizer = AutoTokenizer.from_pretrained("tokenlabsdotrun/Llama-3.1-8B-
+tokenizer = AutoTokenizer.from_pretrained("tokenlabsdotrun/Llama-3.1-8B-ModelOpt-FP8")
+
 inputs = tokenizer("Hello, my name is", return_tensors="pt")
 outputs = model.generate(**inputs, max_new_tokens=10)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
````
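Since this checkpoint was produced with NVIDIA TensorRT Model Optimizer (ModelOpt) FP8 quantization, it can also be served by an inference engine that understands that format. Below is a minimal sketch using vLLM; the `quantization="modelopt"` argument and FP8 support are assumptions about your vLLM build, not something stated in this commit, so treat it as illustrative rather than the repo's official serving path.

```python
# Illustrative sketch: serving the FP8 checkpoint with vLLM.
# Assumes a vLLM build with ModelOpt FP8 support and an FP8-capable GPU
# (e.g. Hopper); quantization="modelopt" is an assumed flag, not taken
# from this repo's README.
from vllm import LLM, SamplingParams

llm = LLM(
    model="tokenlabsdotrun/Llama-3.1-8B-ModelOpt-FP8",
    quantization="modelopt",  # assumed: selects vLLM's ModelOpt FP8 scheme
)

# Mirror the README's example: short greedy-ish completion of one prompt
params = SamplingParams(max_tokens=10)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```

The usual motivation for a checkpoint like this is memory: at one byte per weight, the 8B parameters take roughly 8 GB in FP8 versus about 16 GB in BF16.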