olacode55 committed on
Commit
42a1704
·
verified ·
1 Parent(s): e54b246

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -17,14 +17,20 @@ adapter_model = "olacode55/zimble-llama2"
17
 
18
  tokenizer = AutoTokenizer.from_pretrained(base_model)
19
 
 
 
 
 
20
  base = AutoModelForCausalLM.from_pretrained(
21
  base_model,
22
  load_in_8bit=True,
23
  device_map="auto",
24
- llm_int8_enable_fp32_cpu_offload=True, # offload layers to CPU if GPU full
25
- use_auth_token="hf_" +hf_token
 
26
  )
27
 
 
28
  model = PeftModel.from_pretrained(base, adapter_model)
29
 
30
  # === STEP 3: Define generation function ===
 
17
 
18
  tokenizer = AutoTokenizer.from_pretrained(base_model)
19
 
20
+ offload_folder = "./offload" # must exist or be creatable
21
+ os.makedirs(offload_folder, exist_ok=True)
22
+
23
+ # --- Load model with 8-bit quantization and CPU offload ---
24
  base = AutoModelForCausalLM.from_pretrained(
25
  base_model,
26
  load_in_8bit=True,
27
  device_map="auto",
28
+ offload_folder=offload_folder,
29
+ llm_int8_enable_fp32_cpu_offload=True,
30
+ use_auth_token="hf_" +hf_token
31
  )
32
 
33
+
34
  model = PeftModel.from_pretrained(base, adapter_model)
35
 
36
  # === STEP 3: Define generation function ===