BoostedJonP committed on
Commit
25ab7b8
·
1 Parent(s): 1748543

added quantization for performance

Browse files
Files changed (1) hide show
  1. app.py +12 -2
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from functools import lru_cache
5
  import logging
6
 
@@ -12,6 +12,16 @@ logging.basicConfig(level=logging.INFO)
12
  logger.info("Starting Jerome Powell AI Assistant...")
13
 
14
 
 
 
 
 
 
 
 
 
 
 
15
  @lru_cache(maxsize=1)
16
  def load_model():
17
  """Load the fine-tuned Jerome Powell model"""
@@ -26,7 +36,7 @@ def load_model():
26
  model = AutoModelForCausalLM.from_pretrained(
27
  MODEL_NAME,
28
  trust_remote_code=True,
29
- torch_dtype=torch.float16,
30
  device_map="auto",
31
  attn_implementation="eager",
32
  use_cache=True,
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
4
  from functools import lru_cache
5
  import logging
6
 
 
12
  logger.info("Starting Jerome Powell AI Assistant...")
13
 
14
 
15
+ QUANTIZATION_CONFIG = BitsAndBytesConfig(
16
+ load_in_4bit=True,
17
+ bnb_4bit_quant_type="nf4",
18
+ bnb_4bit_use_double_quant=True,
19
+ bnb_4bit_compute_dtype="float16",
20
+ )
21
+
22
+ MODEL_NAME = "BoostedJonP/powell-phi3-mini"
23
+
24
+
25
  @lru_cache(maxsize=1)
26
  def load_model():
27
  """Load the fine-tuned Jerome Powell model"""
 
36
  model = AutoModelForCausalLM.from_pretrained(
37
  MODEL_NAME,
38
  trust_remote_code=True,
39
+ quantization_config=QUANTIZATION_CONFIG,
40
  device_map="auto",
41
  attn_implementation="eager",
42
  use_cache=True,