Wenye He committed on
Commit
0ffa0b9
·
verified ·
1 Parent(s): 2c277a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -17,7 +17,7 @@ MODEL_CONFIG = {
17
  }
18
  }
19
 
20
- # Quantization config for 4-bit loading
21
  bnb_config = BitsAndBytesConfig(
22
  load_in_4bit=True,
23
  bnb_4bit_quant_type="nf4",
@@ -39,9 +39,9 @@ class ChatModel:
39
 
40
  model = AutoModelForCausalLM.from_pretrained(
41
  config["model_name"],
 
42
  device_map="auto",
43
  torch_dtype=torch.float16,
44
- attn_implementation="flash_attention_2" if "phi-3" in model_name else "eager",
45
  low_cpu_mem_usage=True
46
  )
47
 
@@ -60,7 +60,7 @@ class ChatModel:
60
  "text-generation",
61
  model=self.models[model_name],
62
  tokenizer=self.tokenizers[model_name],
63
- max_new_tokens=512,
64
  temperature=0.7,
65
  top_p=0.9,
66
  repetition_penalty=1.1,
 
17
  }
18
  }
19
 
20
+ # Quantization config (4-bit)
21
  bnb_config = BitsAndBytesConfig(
22
  load_in_4bit=True,
23
  bnb_4bit_quant_type="nf4",
 
39
 
40
  model = AutoModelForCausalLM.from_pretrained(
41
  config["model_name"],
42
+ quantization_config=bnb_config,
43
  device_map="auto",
44
  torch_dtype=torch.float16,
 
45
  low_cpu_mem_usage=True
46
  )
47
 
 
60
  "text-generation",
61
  model=self.models[model_name],
62
  tokenizer=self.tokenizers[model_name],
63
+ max_new_tokens=384,
64
  temperature=0.7,
65
  top_p=0.9,
66
  repetition_penalty=1.1,