Fix zipnn patch
README.md CHANGED
@@ -20,9 +20,9 @@ pip install zipnn
 
 Then simply add at the beginning of the file
 ```python
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 ```
 And continue as usual. The patch will take care of decompressing the model correctly and safely.
 
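For reference, the fixed two-line patch simply sits above any `from_pretrained` call. A minimal sketch, using the same model id that appears in the hunks below:

```python
from transformers import AutoModelForCausalLM
from zipnn import zipnn_hf

# Patch Hugging Face loading so ZipNN-compressed weights are decompressed transparently.
zipnn_hf()

model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
```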
@@ -64,9 +64,9 @@ You can run the model not using the optimized Mamba kernels, but it is **not** r
 
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
 tokenizer = AutoTokenizer.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
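Once the patched load above has run, inference uses the stock transformers API. A quick usage sketch; the prompt and generation length are arbitrary placeholders:

```python
# Arbitrary prompt; generation works as with any uncompressed checkpoint.
inputs = tokenizer("A ZipNN-compressed model loads like any other:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```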
@@ -89,9 +89,9 @@ Please note that if you're using `transformers<4.40.0`, `trust_remote_code=True`
 ```python
 from transformers import AutoModelForCausalLM
 import torch
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed",
                                              torch_dtype=torch.bfloat16)    # you can also use torch_dtype=torch.float16
@@ -100,9 +100,9 @@ model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compres
 When using half precision, you can enable the [FlashAttention2](https://github.com/Dao-AILab/flash-attention) implementation of the Attention blocks. In order to use it, you also need the model on a CUDA device. Since in this precision the model is too big to fit on a single 80GB GPU, you'll also need to parallelize it using [accelerate](https://huggingface.co/docs/accelerate/index):
 ```python
 from transformers import AutoModelForCausalLM
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 import torch
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed",
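The `from_pretrained` call is cut off at the hunk boundary. For context, a typical FlashAttention2 + accelerate invocation looks roughly like the sketch below; the `attn_implementation` and `device_map` arguments are the standard transformers options, assumed here rather than taken from the README itself:

```python
from transformers import AutoModelForCausalLM
from zipnn import zipnn_hf
import torch

zipnn_hf()

# Assumed arguments: flash_attention_2 attention and accelerate-style device sharding.
model = AutoModelForCausalLM.from_pretrained(
    "royleibov/Jamba-v0.1-ZipNN-Compressed",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
```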
@@ -118,9 +118,9 @@ model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compres
 
 ```python
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 quantization_config = BitsAndBytesConfig(load_in_8bit=True,
                                          llm_int8_skip_modules=["mamba"])
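This hunk stops before the quantized load itself. A sketch of how the `BitsAndBytesConfig` above is normally passed on; the `device_map` choice is an assumption, not text from the README:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from zipnn import zipnn_hf

zipnn_hf()

quantization_config = BitsAndBytesConfig(load_in_8bit=True,
                                         llm_int8_skip_modules=["mamba"])

# Assumed: standard transformers arguments for an 8-bit, multi-GPU load.
model = AutoModelForCausalLM.from_pretrained(
    "royleibov/Jamba-v0.1-ZipNN-Compressed",
    quantization_config=quantization_config,
    device_map="auto",
)
```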
@@ -140,9 +140,9 @@ from datasets import load_dataset
 from trl import SFTTrainer, SFTConfig
 from peft import LoraConfig
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 tokenizer = AutoTokenizer.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed",
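The fine-tuning hunk likewise ends mid-call. A rough sketch of how these imports usually fit together with trl and peft; the dataset, LoRA rank and target modules, and trainer arguments below are illustrative assumptions, not the README's actual values:

```python
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from zipnn import zipnn_hf

zipnn_hf()

tokenizer = AutoTokenizer.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed",
                                             device_map="auto")

# Illustrative placeholders: dataset, text field, LoRA targets, and output_dir.
dataset = load_dataset("Abirate/english_quotes", split="train")
lora_config = LoraConfig(r=8,
                         target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
                         task_type="CAUSAL_LM")
trainer = SFTTrainer(
    model=model,
    args=SFTConfig(output_dir="./jamba-lora", dataset_text_field="quote"),
    train_dataset=dataset,
    peft_config=lora_config,
)
trainer.train()
```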