Spaces:
Sleeping
Sleeping
Commit
·
f2c56b3
1
Parent(s):
6cbf469
preserve quantization config
Browse files
app.py
CHANGED
|
@@ -44,9 +44,20 @@ def load_model():
|
|
| 44 |
)
|
| 45 |
else:
|
| 46 |
logger.info("CUDA not available, loading with CPU optimizations")
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
model = AutoModelForCausalLM.from_pretrained(
|
| 51 |
MODEL_NAME,
|
| 52 |
config=config,
|
|
@@ -264,7 +275,7 @@ def create_interface():
|
|
| 264 |
num_beams = gr.Slider(
|
| 265 |
minimum=1,
|
| 266 |
maximum=8,
|
| 267 |
-
value=
|
| 268 |
step=1,
|
| 269 |
label="Number of Beams",
|
| 270 |
info="Higher values = better quality but slower generation (beam search)",
|
|
|
|
| 44 |
)
|
| 45 |
else:
|
| 46 |
logger.info("CUDA not available, loading with CPU optimizations")
|
| 47 |
+
quant_config = getattr(config, "quantization_config", None)
|
| 48 |
+
if quant_config is not None:
|
| 49 |
+
logger.info("Adjusting quantization settings for CPU execution")
|
| 50 |
+
if isinstance(quant_config, dict):
|
| 51 |
+
quant_config.pop("load_in_4bit", None)
|
| 52 |
+
quant_config.pop("load_in_8bit", None)
|
| 53 |
+
else:
|
| 54 |
+
if hasattr(quant_config, "load_in_4bit"):
|
| 55 |
+
quant_config.load_in_4bit = False
|
| 56 |
+
if hasattr(quant_config, "load_in_8bit"):
|
| 57 |
+
quant_config.load_in_8bit = False
|
| 58 |
+
if hasattr(quant_config, "llm_int8_enable_fp32_cpu_offload"):
|
| 59 |
+
quant_config.llm_int8_enable_fp32_cpu_offload = False
|
| 60 |
+
config.quantization_config = quant_config
|
| 61 |
model = AutoModelForCausalLM.from_pretrained(
|
| 62 |
MODEL_NAME,
|
| 63 |
config=config,
|
|
|
|
| 275 |
num_beams = gr.Slider(
|
| 276 |
minimum=1,
|
| 277 |
maximum=8,
|
| 278 |
+
value=1,
|
| 279 |
step=1,
|
| 280 |
label="Number of Beams",
|
| 281 |
info="Higher values = better quality but slower generation (beam search)",
|