aeb56 committed on
Commit · 1a04e17
1 Parent(s): 0dc6b10
Fix 8-bit quantization CPU offload for large models
app.py
CHANGED
@@ -129,6 +129,9 @@ class ModelMerger:
         if use_8bit:
             # Use 8-bit quantization for tighter memory constraints
             load_kwargs["load_in_8bit"] = True
+            load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
+            load_kwargs["llm_int8_threshold"] = 6.0
+            logger.info("Enabling CPU offload for 8-bit quantization")
         else:
             # Use bfloat16 for best quality when memory allows
             load_kwargs["torch_dtype"] = torch.bfloat16
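For context, the kwargs added in this commit correspond to the Hugging Face transformers 8-bit loading path backed by bitsandbytes. Below is a minimal sketch of how such load_kwargs are typically consumed, assuming the surrounding ModelMerger code ultimately passes them to AutoModelForCausalLM.from_pretrained with a device_map; the helper name and model argument are placeholders for illustration, not taken from app.py.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_for_merge(model_name: str, use_8bit: bool):
    """Hypothetical helper showing the loading path the diff touches."""
    load_kwargs = {"device_map": "auto"}  # CPU offload requires a device map
    if use_8bit:
        # Recent transformers versions prefer wrapping the flags in
        # BitsAndBytesConfig; passing load_in_8bit directly to
        # from_pretrained is the older form used in the diff.
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True,
            # Keep modules dispatched to CPU in fp32 instead of int8,
            # so layers that do not fit on the GPU can still be offloaded.
            llm_int8_enable_fp32_cpu_offload=True,
            # Outlier threshold for the mixed int8/fp16 decomposition
            # (6.0 is the library default).
            llm_int8_threshold=6.0,
        )
    else:
        # Use bfloat16 for best quality when memory allows.
        load_kwargs["torch_dtype"] = torch.bfloat16
    return AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

With device_map="auto" and the offload flag enabled, layers that do not fit in GPU memory are placed on the CPU in fp32 rather than triggering an out-of-memory failure, which matches the commit's stated goal of making 8-bit loading work for large models.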