udituen committed on
Commit
0f68754
·
verified ·
1 Parent(s): ee4e258

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +10 -2
src/streamlit_app.py CHANGED
@@ -67,11 +67,19 @@ def load_retriever():
67
  def load_llm():
68
  # pipe = pipeline("text-generation", model="google/flan-t5-small", max_new_tokens=256)
69
  # load the tokenizer and model on cpu/gpu
70
- quantization_config = BitsAndBytesConfig(load_in_8bit=True,llm_int8_enable_fp32_cpu_offload=True)
 
 
 
 
 
 
 
 
71
  model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
72
  # model_name = "meta-llama/Llama-2-7b-chat-hf"
73
  tokenizer = AutoTokenizer.from_pretrained(model_name)
74
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True)
75
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
76
 
77
  return HuggingFacePipeline(pipeline=pipe)
 
67
  def load_llm():
68
  # pipe = pipeline("text-generation", model="google/flan-t5-small", max_new_tokens=256)
69
  # load the tokenizer and model on cpu/gpu
70
+ # quantization_config = BitsAndBytesConfig(load_in_8bit=True,llm_int8_enable_fp32_cpu_offload=True)
71
+
72
+ quantization_config = BitsAndBytesConfig(
73
+ load_in_4bit=True,
74
+ bnb_4bit_compute_dtype=torch.float16,
75
+ bnb_4bit_quant_type="nf4",
76
+ bnb_4bit_use_double_quant=True
77
+ )
78
+
79
  model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
80
  # model_name = "meta-llama/Llama-2-7b-chat-hf"
81
  tokenizer = AutoTokenizer.from_pretrained(model_name)
82
+ model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config, low_cpu_mem_usage=True)
83
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
84
 
85
  return HuggingFacePipeline(pipeline=pipe)