aeb56 committed
Commit 1a04e17 · Parent(s): 0dc6b10

Fix 8-bit quantization CPU offload for large models

Files changed (1): app.py +3 -0
app.py CHANGED
@@ -129,6 +129,9 @@ class ModelMerger:
         if use_8bit:
             # Use 8-bit quantization for tighter memory constraints
             load_kwargs["load_in_8bit"] = True
+            load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
+            load_kwargs["llm_int8_threshold"] = 6.0
+            logger.info("Enabling CPU offload for 8-bit quantization")
         else:
             # Use bfloat16 for best quality when memory allows
             load_kwargs["torch_dtype"] = torch.bfloat16
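For context, a minimal sketch of how these load_kwargs are presumably consumed downstream; the from_pretrained call itself is outside this hunk, and "org/model-id" is a placeholder. Recent transformers releases deprecate passing load_in_8bit and the llm_int8_* flags as bare keyword arguments in favor of an explicit BitsAndBytesConfig, which accepts the same fields:

# Sketch only: assumes the flags set in the diff above feed a
# from_pretrained call elsewhere in ModelMerger (not shown here).
# "org/model-id" is a placeholder model identifier.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

use_8bit = True

if use_8bit:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # Let int8 modules that don't fit on GPU fall back to fp32 on CPU
        llm_int8_enable_fp32_cpu_offload=True,
        # Outlier threshold for mixed int8/fp16 matmul (6.0 is the library default)
        llm_int8_threshold=6.0,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "org/model-id",
        quantization_config=quant_config,
        device_map="auto",  # lets accelerate place overflow modules on CPU
    )
else:
    # Full-precision-ish path: bfloat16 when memory allows
    model = AutoModelForCausalLM.from_pretrained(
        "org/model-id",
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

Note that llm_int8_enable_fp32_cpu_offload only matters when the device map actually places some modules on CPU; if everything fits on GPU the flag is a no-op, which is why a device_map accompanies it in the sketch.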