ayscript committed · Commit dbe0fb5 · verified · 1 parent: e70e50a

Update app.py

Files changed (1): app.py (+4 −4)
app.py CHANGED
@@ -17,21 +17,21 @@ MODEL_ID = "NCAIR1/N-ATLaS"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # --- 4-Bit Quantization Config ---
+# 1. Define the config with the offload flag
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_use_double_quant=True,
+    llm_int8_enable_fp32_cpu_offload=True  # <--- Move it here!
 )
 
-# --- Model Loading ---
+# 2. Load the model (remove the extra argument from here)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    quantization_config=quantization_config,  # Apply the quantization
+    quantization_config=quantization_config,
     device_map="auto",
-    llm_int8_enable_fp32_cpu_offload=True,
-    torch_dtype=torch.float16,
     low_cpu_mem_usage=True
 )
 
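
For reference, a minimal sketch of how the edited section of app.py reads after this commit. The imports (torch, transformers) are filled in here so the snippet is self-contained; they are assumed from context rather than shown in the diff. The substance of the change is that llm_int8_enable_fp32_cpu_offload is a field of BitsAndBytesConfig, not an argument of from_pretrained(), so it has to live in the config object:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "NCAIR1/N-ATLaS"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit quantization config; the CPU-offload flag belongs here, not in from_pretrained()
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,  # allow layers that spill to CPU to stay in fp32
)

# Load tokenizer and the 4-bit quantized model; device_map="auto" places layers on GPU/CPU
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)

With the flag inside the config, accelerate's device_map="auto" can keep modules that do not fit on the GPU on the CPU in fp32 instead of failing at load time. The torch_dtype=torch.float16 argument was presumably dropped because bnb_4bit_compute_dtype already determines the compute precision for the quantized weights.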