Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,11 +6,23 @@ from typing import List, Dict, Any, Optional
|
|
| 6 |
from functools import lru_cache
|
| 7 |
from xml.etree import ElementTree as ET
|
| 8 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
|
|
| 9 |
try:
|
| 10 |
-
from transformers import BitsAndBytesConfig
|
| 11 |
-
except
|
| 12 |
BitsAndBytesConfig = None
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
import numpy as np
|
| 16 |
import requests
|
|
@@ -205,7 +217,9 @@ if ADAPTER_REPO:
|
|
| 205 |
dlog("LLM", f"Loading base model: {BASE_MODEL}")
|
| 206 |
tokenizer_lm = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
|
| 207 |
|
| 208 |
-
|
|
|
|
|
|
|
| 209 |
bnb_config = BitsAndBytesConfig(
|
| 210 |
load_in_8bit=(QUANTIZE == "8bit"),
|
| 211 |
load_in_4bit=(QUANTIZE == "4bit"),
|
|
@@ -219,7 +233,7 @@ if QUANTIZE in {"8bit", "4bit"} and BitsAndBytesConfig is not None:
|
|
| 219 |
quantization_config=bnb_config,
|
| 220 |
)
|
| 221 |
else:
|
| 222 |
-
#
|
| 223 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 224 |
BASE_MODEL,
|
| 225 |
torch_dtype=dtype,
|
|
@@ -228,6 +242,7 @@ else:
|
|
| 228 |
|
| 229 |
|
| 230 |
|
|
|
|
| 231 |
dlog("LLM", f"Loading LoRA adapter from: {ADAPTER_PATH}")
|
| 232 |
model_lm = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
|
| 233 |
model_lm.eval()
|
|
|
|
| 6 |
from functools import lru_cache
|
| 7 |
from xml.etree import ElementTree as ET
|
| 8 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 9 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 10 |
try:
|
| 11 |
+
from transformers import BitsAndBytesConfig # exists even if bitsandbytes isn't installed
|
| 12 |
+
except Exception:
|
| 13 |
BitsAndBytesConfig = None
|
| 14 |
|
| 15 |
+
# Normalize QUANTIZE env
|
| 16 |
+
QUANTIZE = os.environ.get("QUANTIZE", "none").strip().lower()
|
| 17 |
+
|
| 18 |
+
# Detect bitsandbytes presence
|
| 19 |
+
try:
|
| 20 |
+
import bitsandbytes as _bnb # noqa: F401
|
| 21 |
+
_BNB_AVAILABLE = True
|
| 22 |
+
except Exception:
|
| 23 |
+
_BNB_AVAILABLE = False
|
| 24 |
+
|
| 25 |
+
|
| 26 |
|
| 27 |
import numpy as np
|
| 28 |
import requests
|
|
|
|
| 217 |
dlog("LLM", f"Loading base model: {BASE_MODEL}")
|
| 218 |
tokenizer_lm = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
|
| 219 |
|
| 220 |
+
use_bnb = QUANTIZE in {"8bit", "4bit"} and BitsAndBytesConfig is not None and _BNB_AVAILABLE
|
| 221 |
+
|
| 222 |
+
if use_bnb:
|
| 223 |
bnb_config = BitsAndBytesConfig(
|
| 224 |
load_in_8bit=(QUANTIZE == "8bit"),
|
| 225 |
load_in_4bit=(QUANTIZE == "4bit"),
|
|
|
|
| 233 |
quantization_config=bnb_config,
|
| 234 |
)
|
| 235 |
else:
|
| 236 |
+
# Default / fallback: fp16 (no bitsandbytes required)
|
| 237 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 238 |
BASE_MODEL,
|
| 239 |
torch_dtype=dtype,
|
|
|
|
| 242 |
|
| 243 |
|
| 244 |
|
| 245 |
+
|
| 246 |
dlog("LLM", f"Loading LoRA adapter from: {ADAPTER_PATH}")
|
| 247 |
model_lm = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
|
| 248 |
model_lm.eval()
|