ayscript committed · Commit dbe0fb5 · verified · 1 parent: e70e50a

Update app.py

Files changed (1): app.py (+4 −4)
app.py CHANGED
@@ -17,21 +17,21 @@ MODEL_ID = "NCAIR1/N-ATLaS"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # --- 4-Bit Quantization Config ---
+# 1. Define the config with the offload flag
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_use_double_quant=True,
+    llm_int8_enable_fp32_cpu_offload=True  # <--- Move it here!
 )
 
-# --- Model Loading ---
+# 2. Load the model (remove the extra argument from here)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    quantization_config=quantization_config,  # Apply the quantization
+    quantization_config=quantization_config,
     device_map="auto",
-    llm_int8_enable_fp32_cpu_offload=True,
-    torch_dtype=torch.float16,
     low_cpu_mem_usage=True
 )
 
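
For reference, a minimal sketch of how the edited section of app.py reads after this commit. The imports (torch, transformers) are filled in here so the snippet is self-contained; they are assumed from context rather than shown in the diff. The substance of the change is that llm_int8_enable_fp32_cpu_offload is a field of BitsAndBytesConfig, not an argument of from_pretrained(), so it has to live in the config object:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "NCAIR1/N-ATLaS"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit quantization config; the CPU-offload flag belongs here, not in from_pretrained()
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,  # allow layers that spill to CPU to stay in fp32
)

# Load tokenizer and the 4-bit quantized model; device_map="auto" places layers on GPU/CPU
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)

With the flag inside the config, accelerate's device_map="auto" can keep modules that do not fit on the GPU on the CPU in fp32 instead of failing at load time. The torch_dtype=torch.float16 argument was presumably dropped because bnb_4bit_compute_dtype already determines the compute precision for the quantized weights.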