Tonic
committed on
reduce memory footprint bfloat16
Browse files
app.py
CHANGED
|
@@ -24,12 +24,19 @@ quantization_config = BitsAndBytesConfig(
|
|
| 24 |
bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation to save memory
|
| 25 |
)
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Load tokenizer and model
|
| 28 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
|
| 29 |
model = AutoModelForCausalLM.from_pretrained(
|
| 30 |
model_id,
|
| 31 |
quantization_config=quantization_config, # Apply quantization
|
| 32 |
# device_map="auto", # Automatically map to available devices
|
|
|
|
| 33 |
torch_dtype=torch.bfloat16,
|
| 34 |
token=HF_TOKEN,
|
| 35 |
max_position_embeddings=4096 # Reduce context window to 4k tokens (from 128k)
|
|
|
|
| 24 |
bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation to save memory
|
| 25 |
)
|
| 26 |
|
| 27 |
+
# Custom device map to offload non-critical components
|
| 28 |
+
custom_device_map = {
|
| 29 |
+
"transformer": "cuda", # Keep transformer layers on GPU
|
| 30 |
+
"lm_head": "cpu", # Offload language model head to CPU
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
# Load tokenizer and model
|
| 34 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
|
| 35 |
model = AutoModelForCausalLM.from_pretrained(
|
| 36 |
model_id,
|
| 37 |
quantization_config=quantization_config, # Apply quantization
|
| 38 |
# device_map="auto", # Automatically map to available devices
|
| 39 |
+
device_map=custom_device_map, # Use custom device map
|
| 40 |
torch_dtype=torch.bfloat16,
|
| 41 |
token=HF_TOKEN,
|
| 42 |
max_position_embeddings=4096 # Reduce context window to 4k tokens (from 128k)
|