kofdai committed
Commit fa66b2b · 1 Parent(s): 90a8b3a

Add 8-bit quantization to reduce memory usage

Files changed (2)
  1. app.py +6 -6
  2. requirements.txt +1 -0
app.py CHANGED
@@ -15,20 +15,20 @@ def load_model():
     global model, tokenizer, device
     if model is not None:
         return
-    print(f"Loading {DEFAULT_MODEL}...")
+    print(f"Loading {DEFAULT_MODEL} with 8-bit quantization...")
     device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
     print(f"Using device: {device}")
     tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, trust_remote_code=True)
+
+    # Use 8-bit quantization to reduce memory usage
     model = AutoModelForCausalLM.from_pretrained(
         DEFAULT_MODEL,
-        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-        device_map="auto" if device == "cuda" else None,
+        load_in_8bit=True,
+        device_map="auto",
         trust_remote_code=True
     )
-    if device != "cuda":
-        model = model.to(device)
     model.eval()
-    print("Model loaded!")
+    print("Model loaded with 8-bit quantization!")
 
 def get_prompt(domain, question):
     domains = {
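
A note on this change: passing load_in_8bit=True directly to from_pretrained is deprecated in recent transformers releases in favor of a BitsAndBytesConfig, and the bitsandbytes 8-bit kernels require a CUDA GPU, so the MPS/CPU branch of the device check above can no longer reach this loader. Below is a minimal sketch of an equivalent loader that keeps a non-CUDA fallback; the load_quantized helper name and the placeholder model id are hypothetical, and DEFAULT_MODEL is assumed to be defined as in app.py.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

DEFAULT_MODEL = "org/model-name"  # hypothetical placeholder; app.py defines the real value

def load_quantized(model_id: str = DEFAULT_MODEL):
    # Sketch only: newer transformers versions prefer BitsAndBytesConfig over
    # the bare load_in_8bit kwarg used in this commit.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if torch.cuda.is_available():
        # 8-bit weights roughly halve memory vs. fp16 (quarter it vs. fp32);
        # bitsandbytes kernels only run on CUDA devices.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        # Fall back to full precision where bitsandbytes is unavailable.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float32,
            trust_remote_code=True,
        )
        model = model.to("mps" if torch.backends.mps.is_available() else "cpu")
    model.eval()
    return model, tokenizer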
requirements.txt CHANGED
@@ -5,3 +5,4 @@ accelerate>=0.20.0
 huggingface_hub>=0.20.0
 sentencepiece>=0.1.99
 protobuf>=3.20.0
+bitsandbytes>=0.41.0
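
With bitsandbytes installed, the memory saving can be spot-checked with get_memory_footprint(), which transformers exposes on loaded models; a quick check, reusing the load_quantized sketch above:

# get_memory_footprint() returns the total size of parameters and buffers
# in bytes; an 8-bit load should come out well under the fp16 figure.
model, tokenizer = load_quantized()
print(f"Model footprint: {model.get_memory_footprint() / 1e9:.2f} GB")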