Commit
·
f301d67
1
Parent(s):
8aa3ebb
Upgraded Gradio and other packages to their latest versions. Replaced ctransformers with llama-cpp-python.
Browse files
- Dockerfile +3 -2
- README.md +1 -1
- app.py +105 -20
- chatfuncs/chatfuncs.py +209 -45
- chatfuncs/ingest.py +1 -1
- requirements.txt +8 -5
Dockerfile
CHANGED
|
@@ -13,13 +13,14 @@ USER user
|
|
| 13 |
# Set home to the user's home directory
|
| 14 |
ENV HOME=/home/user \
|
| 15 |
PATH=/home/user/.local/bin:$PATH \
|
| 16 |
-
|
| 17 |
PYTHONUNBUFFERED=1 \
|
| 18 |
GRADIO_ALLOW_FLAGGING=never \
|
| 19 |
GRADIO_NUM_PORTS=1 \
|
| 20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 21 |
GRADIO_THEME=huggingface \
|
| 22 |
-
SYSTEM=spaces
|
|
|
|
| 23 |
|
| 24 |
# Set the working directory to the user's home directory
|
| 25 |
WORKDIR $HOME/app
|
|
|
|
| 13 |
# Set home to the user's home directory
|
| 14 |
ENV HOME=/home/user \
|
| 15 |
PATH=/home/user/.local/bin:$PATH \
|
| 16 |
+
PYTHONPATH=$HOME/app \
|
| 17 |
PYTHONUNBUFFERED=1 \
|
| 18 |
GRADIO_ALLOW_FLAGGING=never \
|
| 19 |
GRADIO_NUM_PORTS=1 \
|
| 20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 21 |
GRADIO_THEME=huggingface \
|
| 22 |
+
SYSTEM=spaces \
|
| 23 |
+
LLAMA_CUBLAS=1
|
| 24 |
|
| 25 |
# Set the working directory to the user's home directory
|
| 26 |
WORKDIR $HOME/app
|
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: 🌍
|
|
| 4 |
colorFrom: yellow
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
|
|
|
| 4 |
colorFrom: yellow
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.31.5
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
app.py
CHANGED
|
@@ -11,6 +11,12 @@ import pandas as pd
|
|
| 11 |
from transformers import AutoTokenizer
|
| 12 |
from ctransformers import AutoModelForCausalLM
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
PandasDataFrame = Type[pd.DataFrame]
|
| 15 |
|
| 16 |
# Disable cuda devices if necessary
|
|
@@ -38,7 +44,7 @@ def get_faiss_store(faiss_vstore_folder,embeddings):
|
|
| 38 |
with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
|
| 39 |
zip_ref.extractall(faiss_vstore_folder)
|
| 40 |
|
| 41 |
-
faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)
|
| 42 |
os.remove(faiss_vstore_folder + "/index.faiss")
|
| 43 |
os.remove(faiss_vstore_folder + "/index.pkl")
|
| 44 |
|
|
@@ -53,6 +59,78 @@ import chatfuncs.chatfuncs as chatf
|
|
| 53 |
chatf.embeddings = load_embeddings(embeddings_name)
|
| 54 |
chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
|
| 57 |
print("Loading model")
|
| 58 |
|
|
@@ -67,26 +145,35 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
|
|
| 67 |
if model_type == "Mistral Open Orca (larger, slow)":
|
| 68 |
if torch_device == "cuda":
|
| 69 |
gpu_config.update_gpu(gpu_layers)
|
|
|
|
| 70 |
else:
|
| 71 |
gpu_config.update_gpu(gpu_layers)
|
| 72 |
cpu_config.update_gpu(gpu_layers)
|
| 73 |
|
| 74 |
-
|
| 75 |
|
| 76 |
print(vars(gpu_config))
|
| 77 |
print(vars(cpu_config))
|
| 78 |
|
| 79 |
try:
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
except:
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
model =
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
tokenizer = []
|
| 92 |
|
|
@@ -100,14 +187,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
|
|
| 100 |
|
| 101 |
if torch_device == "cuda":
|
| 102 |
if "flan" in model_name:
|
| 103 |
-
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
|
| 104 |
else:
|
| 105 |
-
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
|
| 106 |
else:
|
| 107 |
if "flan" in model_name:
|
| 108 |
-
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
| 109 |
else:
|
| 110 |
-
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
| 111 |
|
| 112 |
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
|
| 113 |
|
|
@@ -179,7 +266,7 @@ with block:
|
|
| 179 |
#chat_height = 500
|
| 180 |
chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
|
| 181 |
with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
|
| 182 |
-
sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here"
|
| 183 |
|
| 184 |
with gr.Row():
|
| 185 |
message = gr.Textbox(
|
|
@@ -233,7 +320,7 @@ with block:
|
|
| 233 |
|
| 234 |
|
| 235 |
gr.HTML(
|
| 236 |
-
"<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers,
|
| 237 |
)
|
| 238 |
|
| 239 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
|
@@ -289,6 +376,4 @@ with block:
|
|
| 289 |
# Thumbs up or thumbs down voting function
|
| 290 |
chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
|
| 291 |
|
| 292 |
-
block.queue(
|
| 293 |
-
# -
|
| 294 |
-
|
|
|
|
| 11 |
from transformers import AutoTokenizer
|
| 12 |
from ctransformers import AutoModelForCausalLM
|
| 13 |
|
| 14 |
+
import torch
|
| 15 |
+
|
| 16 |
+
import llama_cpp
|
| 17 |
+
from llama_cpp import Llama
|
| 18 |
+
from huggingface_hub import hf_hub_download
|
| 19 |
+
|
| 20 |
PandasDataFrame = Type[pd.DataFrame]
|
| 21 |
|
| 22 |
# Disable cuda devices if necessary
|
|
|
|
| 44 |
with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
|
| 45 |
zip_ref.extractall(faiss_vstore_folder)
|
| 46 |
|
| 47 |
+
faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings, allow_dangerous_deserialization=True)
|
| 48 |
os.remove(faiss_vstore_folder + "/index.faiss")
|
| 49 |
os.remove(faiss_vstore_folder + "/index.pkl")
|
| 50 |
|
|
|
|
| 59 |
chatf.embeddings = load_embeddings(embeddings_name)
|
| 60 |
chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
|
| 61 |
|
| 62 |
+
# def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
|
| 63 |
+
# print("Loading model")
|
| 64 |
+
|
| 65 |
+
# # Default values inside the function
|
| 66 |
+
# if gpu_config is None:
|
| 67 |
+
# gpu_config = chatf.gpu_config
|
| 68 |
+
# if cpu_config is None:
|
| 69 |
+
# cpu_config = chatf.cpu_config
|
| 70 |
+
# if torch_device is None:
|
| 71 |
+
# torch_device = chatf.torch_device
|
| 72 |
+
|
| 73 |
+
# if model_type == "Mistral Open Orca (larger, slow)":
|
| 74 |
+
# if torch_device == "cuda":
|
| 75 |
+
# gpu_config.update_gpu(gpu_layers)
|
| 76 |
+
# else:
|
| 77 |
+
# gpu_config.update_gpu(gpu_layers)
|
| 78 |
+
# cpu_config.update_gpu(gpu_layers)
|
| 79 |
+
|
| 80 |
+
# print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
|
| 81 |
+
|
| 82 |
+
# print(vars(gpu_config))
|
| 83 |
+
# print(vars(cpu_config))
|
| 84 |
+
|
| 85 |
+
# try:
|
| 86 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
| 87 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
| 88 |
+
# model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
| 89 |
+
# #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
|
| 90 |
+
|
| 91 |
+
# except:
|
| 92 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
|
| 93 |
+
# #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
|
| 94 |
+
# model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
|
| 95 |
+
# #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
|
| 96 |
+
|
| 97 |
+
# tokenizer = []
|
| 98 |
+
|
| 99 |
+
# if model_type == "Flan Alpaca (small, fast)":
|
| 100 |
+
# # Huggingface chat model
|
| 101 |
+
# hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
|
| 102 |
+
|
| 103 |
+
# def create_hf_model(model_name):
|
| 104 |
+
|
| 105 |
+
# from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
|
| 106 |
+
|
| 107 |
+
# if torch_device == "cuda":
|
| 108 |
+
# if "flan" in model_name:
|
| 109 |
+
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
|
| 110 |
+
# else:
|
| 111 |
+
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
|
| 112 |
+
# else:
|
| 113 |
+
# if "flan" in model_name:
|
| 114 |
+
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
| 115 |
+
# else:
|
| 116 |
+
# model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
| 117 |
+
|
| 118 |
+
# tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
|
| 119 |
+
|
| 120 |
+
# return model, tokenizer, model_type
|
| 121 |
+
|
| 122 |
+
# model, tokenizer, model_type = create_hf_model(model_name = hf_checkpoint)
|
| 123 |
+
|
| 124 |
+
# chatf.model = model
|
| 125 |
+
# chatf.tokenizer = tokenizer
|
| 126 |
+
# chatf.model_type = model_type
|
| 127 |
+
|
| 128 |
+
# load_confirmation = "Finished loading model: " + model_type
|
| 129 |
+
|
| 130 |
+
# print(load_confirmation)
|
| 131 |
+
# return model_type, load_confirmation, model_type
|
| 132 |
+
|
| 133 |
+
|
| 134 |
def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
|
| 135 |
print("Loading model")
|
| 136 |
|
|
|
|
| 145 |
if model_type == "Mistral Open Orca (larger, slow)":
|
| 146 |
if torch_device == "cuda":
|
| 147 |
gpu_config.update_gpu(gpu_layers)
|
| 148 |
+
print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
|
| 149 |
else:
|
| 150 |
gpu_config.update_gpu(gpu_layers)
|
| 151 |
cpu_config.update_gpu(gpu_layers)
|
| 152 |
|
| 153 |
+
print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")
|
| 154 |
|
| 155 |
print(vars(gpu_config))
|
| 156 |
print(vars(cpu_config))
|
| 157 |
|
| 158 |
try:
|
| 159 |
+
model = Llama(
|
| 160 |
+
model_path=hf_hub_download(
|
| 161 |
+
repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
|
| 162 |
+
filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
|
| 163 |
+
),
|
| 164 |
+
**vars(gpu_config) # change n_gpu_layers if you have more or less VRAM
|
| 165 |
+
)
|
| 166 |
|
| 167 |
+
except Exception as e:
|
| 168 |
+
print("GPU load failed")
|
| 169 |
+
print(e)
|
| 170 |
+
model = Llama(
|
| 171 |
+
model_path=hf_hub_download(
|
| 172 |
+
repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
|
| 173 |
+
filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
|
| 174 |
+
),
|
| 175 |
+
**vars(cpu_config)
|
| 176 |
+
)
|
| 177 |
|
| 178 |
tokenizer = []
|
| 179 |
|
|
|
|
| 187 |
|
| 188 |
if torch_device == "cuda":
|
| 189 |
if "flan" in model_name:
|
| 190 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
|
| 191 |
else:
|
| 192 |
+
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
|
| 193 |
else:
|
| 194 |
if "flan" in model_name:
|
| 195 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
|
| 196 |
else:
|
| 197 |
+
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
|
| 198 |
|
| 199 |
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
|
| 200 |
|
|
|
|
| 266 |
#chat_height = 500
|
| 267 |
chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
|
| 268 |
with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
|
| 269 |
+
sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here") # , height=chat_height
|
| 270 |
|
| 271 |
with gr.Row():
|
| 272 |
message = gr.Textbox(
|
|
|
|
| 320 |
|
| 321 |
|
| 322 |
gr.HTML(
|
| 323 |
+
"<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers, and Llama.cpp.</a></center>"
|
| 324 |
)
|
| 325 |
|
| 326 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
|
|
|
| 376 |
# Thumbs up or thumbs down voting function
|
| 377 |
chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
|
| 378 |
|
| 379 |
+
block.queue().launch(debug=True)
|
|
|
|
|
|
chatfuncs/chatfuncs.py
CHANGED
|
@@ -38,6 +38,11 @@ from gensim.corpora import Dictionary
|
|
| 38 |
from gensim.models import TfidfModel, OkapiBM25Model
|
| 39 |
from gensim.similarities import SparseMatrixSimilarity
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
import gradio as gr
|
| 42 |
|
| 43 |
torch.cuda.empty_cache()
|
|
@@ -70,7 +75,7 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
|
|
| 70 |
# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
|
| 71 |
if torch.cuda.is_available():
|
| 72 |
torch_device = "cuda"
|
| 73 |
-
gpu_layers =
|
| 74 |
else:
|
| 75 |
torch_device = "cpu"
|
| 76 |
gpu_layers = 0
|
|
@@ -96,67 +101,129 @@ context_length:int = 2048
|
|
| 96 |
sample = True
|
| 97 |
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
class CtransInitConfig_gpu:
|
| 100 |
-
def __init__(self, temperature=temperature,
|
| 101 |
-
top_k=top_k,
|
| 102 |
-
top_p=top_p,
|
| 103 |
-
repetition_penalty=repetition_penalty,
|
| 104 |
last_n_tokens=last_n_tokens,
|
| 105 |
-
max_new_tokens=max_new_tokens,
|
| 106 |
seed=seed,
|
| 107 |
-
reset=reset,
|
| 108 |
-
stream=stream,
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
self.temperature = temperature
|
| 114 |
-
self.top_k = top_k
|
| 115 |
-
self.top_p = top_p
|
| 116 |
-
self.repetition_penalty = repetition_penalty# repetition_penalty
|
| 117 |
self.last_n_tokens = last_n_tokens
|
| 118 |
-
self.max_new_tokens = max_new_tokens
|
| 119 |
self.seed = seed
|
| 120 |
-
self.reset = reset
|
| 121 |
-
self.stream = stream
|
| 122 |
-
self.
|
| 123 |
-
self.
|
| 124 |
-
self.
|
| 125 |
-
self.
|
| 126 |
# self.stop: list[str] = field(default_factory=lambda: [stop_string])
|
| 127 |
|
| 128 |
def update_gpu(self, new_value):
|
| 129 |
-
self.
|
| 130 |
|
| 131 |
class CtransInitConfig_cpu(CtransInitConfig_gpu):
|
| 132 |
def __init__(self):
|
| 133 |
super().__init__()
|
| 134 |
-
self.
|
| 135 |
|
| 136 |
gpu_config = CtransInitConfig_gpu()
|
| 137 |
cpu_config = CtransInitConfig_cpu()
|
| 138 |
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
class CtransGenGenerationConfig:
|
| 141 |
def __init__(self, temperature=temperature,
|
| 142 |
top_k=top_k,
|
| 143 |
top_p=top_p,
|
| 144 |
-
|
| 145 |
-
last_n_tokens=last_n_tokens,
|
| 146 |
seed=seed,
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
| 150 |
):
|
| 151 |
self.temperature = temperature
|
| 152 |
self.top_k = top_k
|
| 153 |
self.top_p = top_p
|
| 154 |
-
self.
|
| 155 |
-
self.last_n_tokens = last_n_tokens
|
| 156 |
self.seed = seed
|
| 157 |
-
self.
|
| 158 |
-
self.
|
| 159 |
-
self.
|
|
|
|
|
|
|
| 160 |
|
| 161 |
def update_temp(self, new_value):
|
| 162 |
self.temperature = new_value
|
|
@@ -352,6 +419,94 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
|
|
| 352 |
return history, docs_content_string, instruction_prompt_out
|
| 353 |
|
| 354 |
# Chat functions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
| 356 |
temperature=temperature,
|
| 357 |
max_new_tokens=max_new_tokens,
|
|
@@ -412,7 +567,9 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
|
| 412 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
| 413 |
|
| 414 |
elif model_type == "Mistral Open Orca (larger, slow)":
|
| 415 |
-
tokens = model.tokenize(full_prompt)
|
|
|
|
|
|
|
| 416 |
|
| 417 |
gen_config = CtransGenGenerationConfig()
|
| 418 |
gen_config.update_temp(temperature)
|
|
@@ -424,13 +581,19 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
|
| 424 |
NUM_TOKENS=0
|
| 425 |
print('-'*4+'Start Generation'+'-'*4)
|
| 426 |
|
|
|
|
|
|
|
|
|
|
| 427 |
history[-1][1] = ""
|
| 428 |
-
for
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
|
|
|
|
|
|
|
|
|
| 434 |
time_generate = time.time() - start
|
| 435 |
print('\n')
|
| 436 |
print('-'*4+'End Generation'+'-'*4)
|
|
@@ -439,6 +602,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
|
| 439 |
print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
| 440 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
| 441 |
|
|
|
|
| 442 |
# Chat helper functions
|
| 443 |
|
| 444 |
def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
|
|
@@ -614,7 +778,7 @@ def hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val, out_p
|
|
| 614 |
# 3rd level check on retrieved docs with SVM retriever
|
| 615 |
|
| 616 |
svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
|
| 617 |
-
svm_result = svm_retriever.
|
| 618 |
|
| 619 |
|
| 620 |
svm_rank=[]
|
|
@@ -994,10 +1158,10 @@ def restore_interactivity():
|
|
| 994 |
return gr.update(interactive=True)
|
| 995 |
|
| 996 |
def update_message(dropdown_value):
|
| 997 |
-
return gr.Textbox
|
| 998 |
|
| 999 |
def hide_block():
|
| 1000 |
-
return gr.Radio
|
| 1001 |
|
| 1002 |
# Vote function
|
| 1003 |
|
|
|
|
| 38 |
from gensim.models import TfidfModel, OkapiBM25Model
|
| 39 |
from gensim.similarities import SparseMatrixSimilarity
|
| 40 |
|
| 41 |
+
import copy
|
| 42 |
+
import llama_cpp
|
| 43 |
+
from llama_cpp import Llama
|
| 44 |
+
from huggingface_hub import hf_hub_download
|
| 45 |
+
|
| 46 |
import gradio as gr
|
| 47 |
|
| 48 |
torch.cuda.empty_cache()
|
|
|
|
| 75 |
# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
|
| 76 |
if torch.cuda.is_available():
|
| 77 |
torch_device = "cuda"
|
| 78 |
+
gpu_layers = 100
|
| 79 |
else:
|
| 80 |
torch_device = "cpu"
|
| 81 |
gpu_layers = 0
|
|
|
|
| 101 |
sample = True
|
| 102 |
|
| 103 |
|
| 104 |
+
# class CtransInitConfig_gpu:
|
| 105 |
+
# def __init__(self, temperature=temperature,
|
| 106 |
+
# top_k=top_k,
|
| 107 |
+
# top_p=top_p,
|
| 108 |
+
# repetition_penalty=repetition_penalty,
|
| 109 |
+
# last_n_tokens=last_n_tokens,
|
| 110 |
+
# max_new_tokens=max_new_tokens,
|
| 111 |
+
# seed=seed,
|
| 112 |
+
# reset=reset,
|
| 113 |
+
# stream=stream,
|
| 114 |
+
# threads=threads,
|
| 115 |
+
# batch_size=batch_size,
|
| 116 |
+
# context_length=context_length,
|
| 117 |
+
# gpu_layers=gpu_layers):
|
| 118 |
+
# self.temperature = temperature
|
| 119 |
+
# self.top_k = top_k
|
| 120 |
+
# self.top_p = top_p
|
| 121 |
+
# self.repetition_penalty = repetition_penalty# repetition_penalty
|
| 122 |
+
# self.last_n_tokens = last_n_tokens
|
| 123 |
+
# self.max_new_tokens = max_new_tokens
|
| 124 |
+
# self.seed = seed
|
| 125 |
+
# self.reset = reset
|
| 126 |
+
# self.stream = stream
|
| 127 |
+
# self.threads = threads
|
| 128 |
+
# self.batch_size = batch_size
|
| 129 |
+
# self.context_length = context_length
|
| 130 |
+
# self.gpu_layers = gpu_layers
|
| 131 |
+
# # self.stop: list[str] = field(default_factory=lambda: [stop_string])
|
| 132 |
+
|
| 133 |
+
# def update_gpu(self, new_value):
|
| 134 |
+
# self.gpu_layers = new_value
|
| 135 |
+
|
| 136 |
+
# class CtransInitConfig_cpu(CtransInitConfig_gpu):
|
| 137 |
+
# def __init__(self):
|
| 138 |
+
# super().__init__()
|
| 139 |
+
# self.gpu_layers = 0
|
| 140 |
+
|
| 141 |
class CtransInitConfig_gpu:
|
| 142 |
+
def __init__(self, #temperature=temperature,
|
| 143 |
+
#top_k=top_k,
|
| 144 |
+
#top_p=top_p,
|
| 145 |
+
#repetition_penalty=repetition_penalty,
|
| 146 |
last_n_tokens=last_n_tokens,
|
| 147 |
+
#max_new_tokens=max_new_tokens,
|
| 148 |
seed=seed,
|
| 149 |
+
#reset=reset,
|
| 150 |
+
#stream=stream,
|
| 151 |
+
n_threads=threads,
|
| 152 |
+
n_batch=batch_size,
|
| 153 |
+
n_ctx=4096,
|
| 154 |
+
n_gpu_layers=gpu_layers):
|
| 155 |
+
#self.temperature = temperature
|
| 156 |
+
#self.top_k = top_k
|
| 157 |
+
#self.top_p = top_p
|
| 158 |
+
#self.repetition_penalty = repetition_penalty# repetition_penalty
|
| 159 |
self.last_n_tokens = last_n_tokens
|
| 160 |
+
#self.max_new_tokens = max_new_tokens
|
| 161 |
self.seed = seed
|
| 162 |
+
#self.reset = reset
|
| 163 |
+
#self.stream = stream
|
| 164 |
+
self.n_threads = n_threads
|
| 165 |
+
self.n_batch = n_batch
|
| 166 |
+
self.n_ctx = n_ctx
|
| 167 |
+
self.n_gpu_layers = n_gpu_layers
|
| 168 |
# self.stop: list[str] = field(default_factory=lambda: [stop_string])
|
| 169 |
|
| 170 |
def update_gpu(self, new_value):
|
| 171 |
+
self.n_gpu_layers = new_value
|
| 172 |
|
| 173 |
class CtransInitConfig_cpu(CtransInitConfig_gpu):
|
| 174 |
def __init__(self):
|
| 175 |
super().__init__()
|
| 176 |
+
self.n_gpu_layers = 0
|
| 177 |
|
| 178 |
gpu_config = CtransInitConfig_gpu()
|
| 179 |
cpu_config = CtransInitConfig_cpu()
|
| 180 |
|
| 181 |
|
| 182 |
+
# class CtransGenGenerationConfig:
|
| 183 |
+
# def __init__(self, temperature=temperature,
|
| 184 |
+
# top_k=top_k,
|
| 185 |
+
# top_p=top_p,
|
| 186 |
+
# repetition_penalty=repetition_penalty,
|
| 187 |
+
# last_n_tokens=last_n_tokens,
|
| 188 |
+
# seed=seed,
|
| 189 |
+
# threads=threads,
|
| 190 |
+
# batch_size=batch_size,
|
| 191 |
+
# reset=True
|
| 192 |
+
# ):
|
| 193 |
+
# self.temperature = temperature
|
| 194 |
+
# self.top_k = top_k
|
| 195 |
+
# self.top_p = top_p
|
| 196 |
+
# self.repetition_penalty = repetition_penalty# repetition_penalty
|
| 197 |
+
# self.last_n_tokens = last_n_tokens
|
| 198 |
+
# self.seed = seed
|
| 199 |
+
# self.threads = threads
|
| 200 |
+
# self.batch_size = batch_size
|
| 201 |
+
# self.reset = reset
|
| 202 |
+
|
| 203 |
class CtransGenGenerationConfig:
|
| 204 |
def __init__(self, temperature=temperature,
|
| 205 |
top_k=top_k,
|
| 206 |
top_p=top_p,
|
| 207 |
+
repeat_penalty=repetition_penalty,
|
| 208 |
+
#last_n_tokens=last_n_tokens,
|
| 209 |
seed=seed,
|
| 210 |
+
stream=stream,
|
| 211 |
+
max_tokens=max_new_tokens
|
| 212 |
+
#threads=threads,
|
| 213 |
+
#batch_size=batch_size,
|
| 214 |
+
#reset=True
|
| 215 |
):
|
| 216 |
self.temperature = temperature
|
| 217 |
self.top_k = top_k
|
| 218 |
self.top_p = top_p
|
| 219 |
+
self.repeat_penalty = repeat_penalty
|
| 220 |
+
#self.last_n_tokens = last_n_tokens
|
| 221 |
self.seed = seed
|
| 222 |
+
self.max_tokens=max_tokens
|
| 223 |
+
self.stream = stream
|
| 224 |
+
#self.threads = threads
|
| 225 |
+
#self.batch_size = batch_size
|
| 226 |
+
#self.reset = reset
|
| 227 |
|
| 228 |
def update_temp(self, new_value):
|
| 229 |
self.temperature = new_value
|
|
|
|
| 419 |
return history, docs_content_string, instruction_prompt_out
|
| 420 |
|
| 421 |
# Chat functions
|
| 422 |
+
# def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
| 423 |
+
# temperature=temperature,
|
| 424 |
+
# max_new_tokens=max_new_tokens,
|
| 425 |
+
# sample=sample,
|
| 426 |
+
# repetition_penalty=repetition_penalty,
|
| 427 |
+
# top_p=top_p,
|
| 428 |
+
# top_k=top_k
|
| 429 |
+
# ):
|
| 430 |
+
# #print("Model type is: ", model_type)
|
| 431 |
+
|
| 432 |
+
# #if not full_prompt.strip():
|
| 433 |
+
# # if history is None:
|
| 434 |
+
# # history = []
|
| 435 |
+
|
| 436 |
+
# # return history
|
| 437 |
+
|
| 438 |
+
# if model_type == "Flan Alpaca (small, fast)":
|
| 439 |
+
# # Get the model and tokenizer, and tokenize the user text.
|
| 440 |
+
# model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
|
| 441 |
+
|
| 442 |
+
# # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
|
| 443 |
+
# # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
|
| 444 |
+
# streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
|
| 445 |
+
# generate_kwargs = dict(
|
| 446 |
+
# model_inputs,
|
| 447 |
+
# streamer=streamer,
|
| 448 |
+
# max_new_tokens=max_new_tokens,
|
| 449 |
+
# do_sample=sample,
|
| 450 |
+
# repetition_penalty=repetition_penalty,
|
| 451 |
+
# top_p=top_p,
|
| 452 |
+
# temperature=temperature,
|
| 453 |
+
# top_k=top_k
|
| 454 |
+
# )
|
| 455 |
+
|
| 456 |
+
# print(generate_kwargs)
|
| 457 |
+
|
| 458 |
+
# t = Thread(target=model.generate, kwargs=generate_kwargs)
|
| 459 |
+
# t.start()
|
| 460 |
+
|
| 461 |
+
# # Pull the generated text from the streamer, and update the model output.
|
| 462 |
+
# start = time.time()
|
| 463 |
+
# NUM_TOKENS=0
|
| 464 |
+
# print('-'*4+'Start Generation'+'-'*4)
|
| 465 |
+
|
| 466 |
+
# history[-1][1] = ""
|
| 467 |
+
# for new_text in streamer:
|
| 468 |
+
# if new_text == None: new_text = ""
|
| 469 |
+
# history[-1][1] += new_text
|
| 470 |
+
# NUM_TOKENS+=1
|
| 471 |
+
# yield history
|
| 472 |
+
|
| 473 |
+
# time_generate = time.time() - start
|
| 474 |
+
# print('\n')
|
| 475 |
+
# print('-'*4+'End Generation'+'-'*4)
|
| 476 |
+
# print(f'Num of generated tokens: {NUM_TOKENS}')
|
| 477 |
+
# print(f'Time for complete generation: {time_generate}s')
|
| 478 |
+
# print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
| 479 |
+
# print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
| 480 |
+
|
| 481 |
+
# elif model_type == "Mistral Open Orca (larger, slow)":
|
| 482 |
+
# tokens = model.tokenize(full_prompt)
|
| 483 |
+
|
| 484 |
+
# gen_config = CtransGenGenerationConfig()
|
| 485 |
+
# gen_config.update_temp(temperature)
|
| 486 |
+
|
| 487 |
+
# print(vars(gen_config))
|
| 488 |
+
|
| 489 |
+
# # Pull the generated text from the streamer, and update the model output.
|
| 490 |
+
# start = time.time()
|
| 491 |
+
# NUM_TOKENS=0
|
| 492 |
+
# print('-'*4+'Start Generation'+'-'*4)
|
| 493 |
+
|
| 494 |
+
# history[-1][1] = ""
|
| 495 |
+
# for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
|
| 496 |
+
# if new_text == None: new_text = ""
|
| 497 |
+
# history[-1][1] += model.detokenize(new_text) #new_text
|
| 498 |
+
# NUM_TOKENS+=1
|
| 499 |
+
# yield history
|
| 500 |
+
|
| 501 |
+
# time_generate = time.time() - start
|
| 502 |
+
# print('\n')
|
| 503 |
+
# print('-'*4+'End Generation'+'-'*4)
|
| 504 |
+
# print(f'Num of generated tokens: {NUM_TOKENS}')
|
| 505 |
+
# print(f'Time for complete generation: {time_generate}s')
|
| 506 |
+
# print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
| 507 |
+
# print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
| 508 |
+
|
| 509 |
+
|
| 510 |
def produce_streaming_answer_chatbot(history, full_prompt, model_type,
|
| 511 |
temperature=temperature,
|
| 512 |
max_new_tokens=max_new_tokens,
|
|
|
|
| 567 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
| 568 |
|
| 569 |
elif model_type == "Mistral Open Orca (larger, slow)":
|
| 570 |
+
#tokens = model.tokenize(full_prompt)
|
| 571 |
+
|
| 572 |
+
temp = ""
|
| 573 |
|
| 574 |
gen_config = CtransGenGenerationConfig()
|
| 575 |
gen_config.update_temp(temperature)
|
|
|
|
| 581 |
NUM_TOKENS=0
|
| 582 |
print('-'*4+'Start Generation'+'-'*4)
|
| 583 |
|
| 584 |
+
output = model(
|
| 585 |
+
full_prompt, **vars(gen_config))
|
| 586 |
+
|
| 587 |
history[-1][1] = ""
|
| 588 |
+
for out in output:
|
| 589 |
+
|
| 590 |
+
if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
|
| 591 |
+
history[-1][1] += out["choices"][0]["text"]
|
| 592 |
+
NUM_TOKENS+=1
|
| 593 |
+
yield history
|
| 594 |
+
else:
|
| 595 |
+
print(f"Unexpected output structure: {out}")
|
| 596 |
+
|
| 597 |
time_generate = time.time() - start
|
| 598 |
print('\n')
|
| 599 |
print('-'*4+'End Generation'+'-'*4)
|
|
|
|
| 602 |
print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
|
| 603 |
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
|
| 604 |
|
| 605 |
+
|
| 606 |
# Chat helper functions
|
| 607 |
|
| 608 |
def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
|
|
|
|
| 778 |
# 3rd level check on retrieved docs with SVM retriever
|
| 779 |
|
| 780 |
svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
|
| 781 |
+
svm_result = svm_retriever.invoke(new_question_kworded)
|
| 782 |
|
| 783 |
|
| 784 |
svm_rank=[]
|
|
|
|
| 1158 |
return gr.update(interactive=True)
|
| 1159 |
|
| 1160 |
def update_message(dropdown_value):
|
| 1161 |
+
return gr.Textbox(value=dropdown_value)
|
| 1162 |
|
| 1163 |
def hide_block():
|
| 1164 |
+
return gr.Radio(visible=False)
|
| 1165 |
|
| 1166 |
# Vote function
|
| 1167 |
|
chatfuncs/ingest.py
CHANGED
|
@@ -21,7 +21,7 @@ from pypdf import PdfReader
|
|
| 21 |
PandasDataFrame = Type[pd.DataFrame]
|
| 22 |
|
| 23 |
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
|
| 24 |
-
chunk_size =
|
| 25 |
chunk_overlap = 0
|
| 26 |
start_index = True
|
| 27 |
|
|
|
|
| 21 |
PandasDataFrame = Type[pd.DataFrame]
|
| 22 |
|
| 23 |
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
|
| 24 |
+
chunk_size = 300
|
| 25 |
chunk_overlap = 0
|
| 26 |
start_index = True
|
| 27 |
|
requirements.txt
CHANGED
|
@@ -3,15 +3,18 @@ langchain-community
|
|
| 3 |
beautifulsoup4
|
| 4 |
pandas
|
| 5 |
transformers
|
| 6 |
-
--extra-index-url https://
|
| 7 |
-
torch
|
|
|
|
| 8 |
sentence_transformers
|
| 9 |
faiss-cpu
|
| 10 |
pypdf
|
| 11 |
python-docx
|
| 12 |
-
ctransformers[cuda]
|
| 13 |
keybert
|
| 14 |
span_marker
|
| 15 |
gensim
|
| 16 |
-
gradio==
|
| 17 |
-
gradio_client
|
|
|
|
|
|
|
|
|
| 3 |
beautifulsoup4
|
| 4 |
pandas
|
| 5 |
transformers
|
| 6 |
+
llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
| 7 |
+
#torch \
|
| 8 |
+
#--extra-index-url https://download.pytorch.org/whl/cu121
|
| 9 |
sentence_transformers
|
| 10 |
faiss-cpu
|
| 11 |
pypdf
|
| 12 |
python-docx
|
| 13 |
+
#ctransformers[cuda]
|
| 14 |
keybert
|
| 15 |
span_marker
|
| 16 |
gensim
|
| 17 |
+
gradio==4.31.5
|
| 18 |
+
gradio_client
|
| 19 |
+
nltk
|
| 20 |
+
scipy<1.13
|