Update src/streamlit_app.py
Browse files
- src/streamlit_app.py +3 -3
src/streamlit_app.py
CHANGED
|
@@ -10,7 +10,7 @@ from langchain.chains.combine_documents import create_stuff_documents_chain
|
|
| 10 |
from langchain_community.llms import Ollama
|
| 11 |
import os
|
| 12 |
import torch
|
| 13 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 14 |
from huggingface_hub import InferenceClient
|
| 15 |
import re
|
| 16 |
|
|
@@ -67,11 +67,11 @@ def load_retriever():
|
|
| 67 |
def load_llm():
|
| 68 |
# pipe = pipeline("text-generation", model="google/flan-t5-small", max_new_tokens=256)
|
| 69 |
# load the tokenizer and model on cpu/gpu
|
| 70 |
-
|
| 71 |
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 72 |
# model_name = "meta-llama/Llama-2-7b-chat-hf"
|
| 73 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 74 |
-
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto",
|
| 75 |
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
|
| 76 |
|
| 77 |
return HuggingFacePipeline(pipeline=pipe)
|
|
|
|
| 10 |
from langchain_community.llms import Ollama
|
| 11 |
import os
|
| 12 |
import torch
|
| 13 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
|
| 14 |
from huggingface_hub import InferenceClient
|
| 15 |
import re
|
| 16 |
|
|
|
|
| 67 |
def load_llm():
|
| 68 |
# pipe = pipeline("text-generation", model="google/flan-t5-small", max_new_tokens=256)
|
| 69 |
# load the tokenizer and model on cpu/gpu
|
| 70 |
+
quantization_config = BitsAndBytesConfig(load_in_8bit=True,llm_int8_enable_fp32_cpu_offload=True)
|
| 71 |
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 72 |
# model_name = "meta-llama/Llama-2-7b-chat-hf"
|
| 73 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 74 |
+
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=quantization_config, device_map="auto", low_cpu_mem_usage=True)
|
| 75 |
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
|
| 76 |
|
| 77 |
return HuggingFacePipeline(pipeline=pipe)
|