Update app.py
Browse files
app.py
CHANGED
|
@@ -17,14 +17,20 @@ adapter_model = "olacode55/zimble-llama2"

Old code (app.py lines 17–30, before this change):

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    base = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=True,
        device_map="auto",
        # (two removed lines here — their content was lost in extraction)
    )

    model = PeftModel.from_pretrained(base, adapter_model)

    # === STEP 3: Define generation function ===
|
|
|
|
| 17 |
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Directory that bitsandbytes/accelerate use for weights offloaded from GPU;
# created up front so from_pretrained never fails on a missing path.
offload_folder = "./offload"
os.makedirs(offload_folder, exist_ok=True)

# NOTE(review): Hugging Face access tokens are issued WITH the "hf_" prefix.
# The original code unconditionally prepended "hf_", which breaks auth when
# hf_token already stores the full token. Prepend only when it is missing,
# which stays backward-compatible with a prefix-less secret.
_auth_token = hf_token if hf_token.startswith("hf_") else "hf_" + hf_token

# --- Load model with 8-bit quantization and CPU offload ---
base = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    device_map="auto",
    offload_folder=offload_folder,
    # Allow fp32 modules to live on CPU when GPU memory is exhausted.
    llm_int8_enable_fp32_cpu_offload=True,
    use_auth_token=_auth_token,
)

# Attach the LoRA/PEFT adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(base, adapter_model)

# === STEP 3: Define generation function ===
|