from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_id = "MaziyarPanahi/Smaug-72B-v0.1-GPTQ"

# Quantization parameters matching how the checkpoint was quantized:
# 4-bit weights, group size 128, no activation-order (desc_act) reordering.
quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False,
)

# Load the pre-quantized weights (safetensors) onto the first GPU.
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    use_safetensors=True,
    device="cuda:0",
    quantize_config=quantize_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# do_sample=True is required for temperature/top_p to take effect;
# without it, transformers ignores them and emits a warning.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
)

outputs = pipe("What is a large language model?")
print(outputs[0]["generated_text"])
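Alternatively, recent transformers releases (with `optimum` and `auto-gptq` installed) can load GPTQ checkpoints directly through the standard `AutoModelForCausalLM` API, without calling auto_gptq's loader yourself. A minimal sketch, assuming this repo ships the usual `quantization_config` metadata in its `config.json`; `device_map="auto"` is an illustrative choice here, not something the snippet above requires:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "MaziyarPanahi/Smaug-72B-v0.1-GPTQ"

# transformers reads the GPTQ settings from the repo's config, so no
# explicit BaseQuantizeConfig is needed (pip install optimum auto-gptq).
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("What is a large language model?", return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Both paths produce the same quantized model; the transformers route is convenient when the rest of your stack already uses `from_pretrained`, while the auto_gptq loader gives finer control over the quantization config.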