Add 8-bit quantization to reduce memory usage
Files changed:
- app.py: +6 -6
- requirements.txt: +1 -0
app.py
@@ -15,20 +15,20 @@ def load_model():
     global model, tokenizer, device
     if model is not None:
         return
-    print(f"Loading {DEFAULT_MODEL}...")
+    print(f"Loading {DEFAULT_MODEL} with 8-bit quantization...")
     device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
     print(f"Using device: {device}")
     tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, trust_remote_code=True)
+
+    # Use 8-bit quantization to reduce memory usage
     model = AutoModelForCausalLM.from_pretrained(
         DEFAULT_MODEL,
-
-        device_map="auto"
+        load_in_8bit=True,
+        device_map="auto",
         trust_remote_code=True
     )
-    if device != "cuda":
-        model = model.to(device)
     model.eval()
-    print("Model loaded!")
+    print("Model loaded with 8-bit quantization!")
 
 def get_prompt(domain, question):
     domains = {
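Note: load_in_8bit=True is delegated to bitsandbytes, whose 8-bit kernels require a CUDA GPU, so on the "mps" and "cpu" paths this loader still detects, from_pretrained will fail rather than quantize. Recent transformers releases also deprecate the bare load_in_8bit argument in favor of passing a BitsAndBytesConfig. A minimal sketch of a loader that quantizes on CUDA and falls back to full precision elsewhere (the function name and fallback policy are illustrative assumptions, not part of this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_quantized_or_full(model_id: str):
    """Load model_id in 8-bit on CUDA; fall back to full precision on MPS/CPU."""
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if torch.cuda.is_available():
        # bitsandbytes 8-bit quantization needs CUDA; device_map="auto"
        # lets accelerate place the quantized weights.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        # No 8-bit support here: load unquantized and move to MPS/CPU by hand.
        device = "mps" if torch.backends.mps.is_available() else "cpu"
        model = AutoModelForCausalLM.from_pretrained(
            model_id, trust_remote_code=True
        ).to(device)
    model.eval()
    return model, tokenizer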
requirements.txt
@@ -5,3 +5,4 @@ accelerate>=0.20.0
 huggingface_hub>=0.20.0
 sentencepiece>=0.1.99
 protobuf>=3.20.0
+bitsandbytes>=0.41.0
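A quick way to sanity-check the savings: PreTrainedModel exposes get_memory_footprint(), which returns the weight memory in bytes. Eight-bit weights use one byte per parameter versus two for fp16 and four for fp32, so expect roughly 1 GiB per billion parameters plus some quantization overhead (an expectation, not a number measured for this Space):

# After load_model() has run:
footprint_gib = model.get_memory_footprint() / 1024**3
print(f"Model weights occupy ~{footprint_gib:.2f} GiB")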