Tonic
committed on
reduce memory footprint bfloat16
Browse files
app.py
CHANGED
|
@@ -24,12 +24,19 @@ quantization_config = BitsAndBytesConfig(
|
|
| 24 |
bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation to save memory
|
| 25 |
)
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Load tokenizer and model
|
| 28 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
|
| 29 |
model = AutoModelForCausalLM.from_pretrained(
|
| 30 |
model_id,
|
| 31 |
quantization_config=quantization_config, # Apply quantization
|
| 32 |
# device_map="auto", # Automatically map to available devices
|
|
|
|
| 33 |
torch_dtype=torch.bfloat16,
|
| 34 |
token=HF_TOKEN,
|
| 35 |
max_position_embeddings=4096 # Reduce context window to 4k tokens (from 128k)
|
|
|
|
| 24 |
bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation to save memory
|
| 25 |
)
|
| 26 |
|
| 27 |
+
# Custom device map to offload non-critical components
|
| 28 |
+
custom_device_map = {
|
| 29 |
+
"transformer": "cuda", # Keep transformer layers on GPU
|
| 30 |
+
"lm_head": "cpu", # Offload language model head to CPU
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
# Load tokenizer and model
|
| 34 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
|
| 35 |
model = AutoModelForCausalLM.from_pretrained(
|
| 36 |
model_id,
|
| 37 |
quantization_config=quantization_config, # Apply quantization
|
| 38 |
# device_map="auto", # Automatically map to available devices
|
| 39 |
+
device_map=custom_device_map, # Use custom device map
|
| 40 |
torch_dtype=torch.bfloat16,
|
| 41 |
token=HF_TOKEN,
|
| 42 |
max_position_embeddings=4096 # Reduce context window to 4k tokens (from 128k)
|