# (scrape residue from the original page, preserved as a comment)
# Spaces: Sleeping
# Sleeping
| # optimized_loading.py | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import torch | |
| from accelerate import infer_auto_device_map | |
| # 4-bit quantization (reduces memory by 75%) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| "TinyLlama/TinyLlama-1.1B-Chat-v1.0", | |
| load_in_4bit=True, # or load_in_8bit=True for 8-bit | |
| device_map="auto", | |
| torch_dtype=torch.float16, | |
| ) | |
| # CPU offloading (for low RAM) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| "TinyLlama/TinyLlama-1.1B-Chat-v1.0", | |
| device_map="auto", | |
| offload_folder="offload", | |
| offload_state_dict=True, | |
| ) |