Spaces:
Sleeping
Sleeping
import os

import torch
from transformers import pipeline, BitsAndBytesConfig

# Read the Hugging Face access token from the environment. Only export it
# when it is actually set: assigning None into os.environ raises TypeError,
# which would crash the app at import time on Spaces without the secret.
hf_token = os.getenv("LLM_token")
if hf_token:
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
# 4-bit NF4 quantization settings for bitsandbytes.
# NOTE(review): currently unused — the quantization_config argument in
# load_model's pipeline(...) call is commented out; confirm whether this
# should be wired back in or removed.
_quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "llm_int8_skip_modules": None,
}
bnb_config = BitsAndBytesConfig(**_quant_settings)
def load_model(model_path="meta-llama/Meta-Llama-3-8B-Instruct", token=None):
    """Build a Hugging Face text-generation pipeline for *model_path*.

    Args:
        model_path: Model id on the Hub (or local path) to load.
        token: Optional Hugging Face access token for gated models; when
            None, falls back to the module-level ``hf_token`` read from
            the ``LLM_token`` environment variable.

    Returns:
        A ``transformers`` text-generation pipeline placed on GPU when
        CUDA is available, otherwise on CPU.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pipe = pipeline(
        "text-generation",
        model=model_path,
        # fp16 halves GPU memory; on CPU keep the checkpoint's default dtype
        model_kwargs={"torch_dtype": torch.float16} if torch.cuda.is_available() else {},
        # quantization_config=bnb_config,  # NOTE(review): bnb_config exists above but is disabled — confirm intent
        device=device,
        token=token if token is not None else hf_token,
    )
    return pipe