BoostedJonP committed on
Commit
f2c56b3
·
1 Parent(s): 6cbf469

preserve quantization config

Browse files
Files changed (1) hide show
  1. app.py +15 -4
app.py CHANGED
@@ -44,9 +44,20 @@ def load_model():
44
  )
45
  else:
46
  logger.info("CUDA not available, loading with CPU optimizations")
47
- if getattr(config, "quantization_config", None) is not None:
48
- logger.info("Disabling quantization settings for CPU execution")
49
- config.quantization_config = None
 
 
 
 
 
 
 
 
 
 
 
50
  model = AutoModelForCausalLM.from_pretrained(
51
  MODEL_NAME,
52
  config=config,
@@ -264,7 +275,7 @@ def create_interface():
264
  num_beams = gr.Slider(
265
  minimum=1,
266
  maximum=8,
267
- value=3,
268
  step=1,
269
  label="Number of Beams",
270
  info="Higher values = better quality but slower generation (beam search)",
 
44
  )
45
  else:
46
  logger.info("CUDA not available, loading with CPU optimizations")
47
+ quant_config = getattr(config, "quantization_config", None)
48
+ if quant_config is not None:
49
+ logger.info("Adjusting quantization settings for CPU execution")
50
+ if isinstance(quant_config, dict):
51
+ quant_config.pop("load_in_4bit", None)
52
+ quant_config.pop("load_in_8bit", None)
53
+ else:
54
+ if hasattr(quant_config, "load_in_4bit"):
55
+ quant_config.load_in_4bit = False
56
+ if hasattr(quant_config, "load_in_8bit"):
57
+ quant_config.load_in_8bit = False
58
+ if hasattr(quant_config, "llm_int8_enable_fp32_cpu_offload"):
59
+ quant_config.llm_int8_enable_fp32_cpu_offload = False
60
+ config.quantization_config = quant_config
61
  model = AutoModelForCausalLM.from_pretrained(
62
  MODEL_NAME,
63
  config=config,
 
275
  num_beams = gr.Slider(
276
  minimum=1,
277
  maximum=8,
278
+ value=1,
279
  step=1,
280
  label="Number of Beams",
281
  info="Higher values = better quality but slower generation (beam search)",