Update README.md
Browse files
README.md
CHANGED
|
@@ -30,9 +30,8 @@ pip install -vvv --no-build-isolation -e .
|
|
| 30 |
|
| 31 |
### Sample code
|
| 32 |
```
|
| 33 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
|
| 34 |
-
from optimum.gptq import GPTQQuantizer, load_quantized_model
|
| 35 |
import torch
|
|
|
|
| 36 |
model_name = "webbigdata/C3TR-Adapter_gptq"
|
| 37 |
|
| 38 |
# thanks to tk-master
|
|
@@ -41,9 +40,11 @@ config = AutoConfig.from_pretrained(model_name)
|
|
| 41 |
config.quantization_config["use_exllama"] = False
|
| 42 |
config.quantization_config["exllama_config"] = {"version":2}
|
| 43 |
|
|
|
|
| 44 |
max_memory={0: "12GiB", "cpu": "10GiB"}
|
|
|
|
| 45 |
quantized_model = AutoModelForCausalLM.from_pretrained(model_name
|
| 46 |
-
, torch_dtype=torch.bfloat16 #
|
| 47 |
, device_map="auto", max_memory=max_memory
|
| 48 |
, config=config)
|
| 49 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
| 30 |
|
| 31 |
### Sample code
|
| 32 |
```
|
|
|
|
|
|
|
| 33 |
import torch
|
| 34 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
|
| 35 |
model_name = "webbigdata/C3TR-Adapter_gptq"
|
| 36 |
|
| 37 |
# thanks to tk-master
|
|
|
|
| 40 |
config.quantization_config["use_exllama"] = False
|
| 41 |
config.quantization_config["exllama_config"] = {"version":2}
|
| 42 |
|
| 43 |
+
# adjust your GPU memory size. 0 means the first GPU.
|
| 44 |
max_memory={0: "12GiB", "cpu": "10GiB"}
|
| 45 |
+
|
| 46 |
quantized_model = AutoModelForCausalLM.from_pretrained(model_name
|
| 47 |
+
, torch_dtype=torch.bfloat16 # change to torch.float16 if you use free Colab or another environment that does not support bfloat16.
|
| 48 |
, device_map="auto", max_memory=max_memory
|
| 49 |
, config=config)
|
| 50 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|