aeb56 committed
Commit 1a04e17 · Parent(s): 0dc6b10

Fix 8-bit quantization CPU offload for large models

Files changed (1): app.py +3 -0
app.py CHANGED
@@ -129,6 +129,9 @@ class ModelMerger:
         if use_8bit:
             # Use 8-bit quantization for tighter memory constraints
             load_kwargs["load_in_8bit"] = True
+            load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
+            load_kwargs["llm_int8_threshold"] = 6.0
+            logger.info("Enabling CPU offload for 8-bit quantization")
         else:
             # Use bfloat16 for best quality when memory allows
             load_kwargs["torch_dtype"] = torch.bfloat16
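For context, a minimal sketch of how these load_kwargs are presumably consumed downstream; the from_pretrained call itself is outside this hunk, and "org/model-id" is a placeholder. Recent transformers releases deprecate passing load_in_8bit and the llm_int8_* flags as bare keyword arguments in favor of an explicit BitsAndBytesConfig, which accepts the same fields:

# Sketch only: assumes the flags set in the diff above feed a
# from_pretrained call elsewhere in ModelMerger (not shown here).
# "org/model-id" is a placeholder model identifier.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

use_8bit = True

if use_8bit:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # Let int8 modules that don't fit on GPU fall back to fp32 on CPU
        llm_int8_enable_fp32_cpu_offload=True,
        # Outlier threshold for mixed int8/fp16 matmul (6.0 is the library default)
        llm_int8_threshold=6.0,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "org/model-id",
        quantization_config=quant_config,
        device_map="auto",  # lets accelerate place overflow modules on CPU
    )
else:
    # Full-precision-ish path: bfloat16 when memory allows
    model = AutoModelForCausalLM.from_pretrained(
        "org/model-id",
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

Note that llm_int8_enable_fp32_cpu_offload only matters when the device map actually places some modules on CPU; if everything fits on GPU the flag is a no-op, which is why a device_map accompanies it in the sketch.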