Sean-Case committed
Commit: d2ddc62
1 Parent(s): f6036ad

Attempt to switch to Orca Mini GGUF

Files changed:
- app.py +3 -4
- chatfuncs/chatfuncs.py +49 -31
- requirements.txt +1 -1
app.py CHANGED

@@ -11,7 +11,6 @@ from langchain.vectorstores import FAISS
 PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 
 # Disable cuda devices if necessary
-
 #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
 #from chatfuncs.chatfuncs import *

@@ -155,7 +154,7 @@ with block:
 ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
 
 gr.HTML(
-"<center>Powered by
+"<center>Powered by Orca Mini and Langchain</a></center>"
 )
 
 examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])

@@ -177,14 +176,14 @@ with block:
 # Click/enter to send message action
 response_click = submit.click(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False, api_name="retrieval").\
 then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-then(chatf.
+then(chatf.produce_streaming_answer_chatbot_ctrans, inputs=[chatbot, instruction_prompt_out], outputs=chatbot)
 response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
 then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
 then(lambda: gr.update(interactive=True), None, [message], queue=False)
 
 response_enter = message.submit(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False).\
 then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-then(chatf.
+then(chatf.produce_streaming_answer_chatbot_ctrans, [chatbot, instruction_prompt_out], chatbot)
 response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
 then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
 then(lambda: gr.update(interactive=True), None, [message], queue=False)
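
Note: the click/submit chains above hand the chatbot state to a generator function, which is what makes the answer stream into the gr.Chatbot. A minimal standalone sketch of that pattern follows; names like add_user_message, fake_stream and demo are illustrative, not from this repo.

import time
import gradio as gr

def add_user_message(message, history):
    # Clear and lock the textbox while the answer is being generated
    return gr.update(value="", interactive=False), history + [[message, ""]]

def fake_stream(history):
    # A generator passed to .then() streams each yielded state into the Chatbot
    for word in "This answer arrives one word at a time.".split():
        history[-1][1] += word + " "
        time.sleep(0.05)
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    message = gr.Textbox()
    submit = gr.Button("Submit")

    submit.click(add_user_message, [message, chatbot], [message, chatbot], queue=False).\
        then(fake_stream, chatbot, chatbot).\
        then(lambda: gr.update(interactive=True), None, [message], queue=False)

demo.queue().launch()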
chatfuncs/chatfuncs.py CHANGED

@@ -7,12 +7,13 @@ import numpy as np
 
 # Model packages
 import torch
+torch.cuda.empty_cache()
 from threading import Thread
 from transformers import AutoTokenizer, pipeline, TextIteratorStreamer
 
 # Alternative model sources
 from gpt4all import GPT4All
-from ctransformers import AutoModelForCausalLM
+from ctransformers import AutoModelForCausalLM#, AutoTokenizer
 
 from dataclasses import asdict, dataclass
 

@@ -44,7 +45,11 @@ from gensim.similarities import SparseMatrixSimilarity
 
 import gradio as gr
 
-
+if torch.cuda.is_available():
+    torch_device = "cuda"
+    gpu_layers = 1
+else: torch_device = "cpu"
+
 print("Running on device:", torch_device)
 threads = 8#torch.get_num_threads()
 print("CPU threads:", threads)

@@ -72,9 +77,27 @@ stream: bool = True
 threads: int = threads
 batch_size:int = 512
 context_length:int = 2048
-gpu_layers:int = 0
+gpu_layers:int = 0#10#gpu_layers
 sample = True
 
+@dataclass
+class GenerationConfig:
+    temperature: float = temperature
+    top_k: int = top_k
+    top_p: float = top_p
+    repetition_penalty: float = repetition_penalty
+    last_n_tokens: int = last_n_tokens
+    max_new_tokens: int = max_new_tokens
+    #seed: int = 42
+    reset: bool = reset
+    stream: bool = stream
+    threads: int = threads
+    batch_size:int = batch_size
+    context_length:int = context_length
+    gpu_layers:int = gpu_layers
+    #stop: list[str] = field(default_factory=lambda: [stop_string])
+
+
 ## Highlight text constants
 hlt_chunk_size = 20
 hlt_strat = [" ", ".", "!", "?", ":", "\n\n", "\n", ","]

@@ -87,17 +110,20 @@ ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-mu
 # Used to pull out keywords from chat history to add to user queries behind the scenes
 kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 
+
+
 ## Chat models ##
 ctrans_llm = [] # Not leaded by default
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/orca_mini_3B-GGML', model_type='llama', model_file='orca-mini-3b.ggmlv3.q4_0.bin')
-
+ctrans_llm = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **asdict(GenerationConfig()))
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/vicuna-13B-v1.5-16K-GGUF', model_type='llama', model_file='vicuna-13b-v1.5-16k.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeUp-Llama-2-13B-Chat-HF-GGUF', model_type='llama', model_file='codeup-llama-2-13b-chat-hf.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeLlama-13B-Instruct-GGUF', model_type='llama', model_file='codellama-13b-instruct.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-Instruct-v0.1-GGUF', model_type='mistral', model_file='mistral-7b-instruct-v0.1.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf')
 
-
+
+#ctokenizer = AutoTokenizer.from_pretrained(ctrans_llm)
 
 # Huggingface chat model
 #hf_checkpoint = 'jphme/phi-1_5_Wizard_Vicuna_uncensored'

@@ -128,7 +154,7 @@ def create_hf_model(model_name):
 
 return model, tokenizer, torch_device
 
-model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
+#model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
 
 # Vectorstore funcs
 

@@ -196,6 +222,17 @@ def create_prompt_templates():
 
 ### Response:"""
 
+instruction_prompt_template_orca_input = """
+### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+### User:
+Answer the QUESTION using information from the following input.
+### Input:
+{summaries}
+QUESTION: {question}
+
+### Response:"""
+
 
 
 

@@ -581,9 +618,6 @@ def create_final_prompt(inputs: Dict[str, str], instruction_prompt, content_prom
 #print("The question passed to the vector search is:")
 #print(new_question_kworded)
 
-#docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(new_question_kworded, k_val = 5, out_passages = 3,
-#                                           vec_score_cut_off = 1.3, vec_weight = 1, tfidf_weight = 0.5, svm_weight = 1)
-
 docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages = 2,
                                            vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
                                            #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])

@@ -868,8 +902,8 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
 print("The question is: ")
 print(full_prompt)
 
-
-
+tokens = ctrans_llm.tokenize(full_prompt)
+
 #import psutil
 #from loguru import logger
 

@@ -884,29 +918,13 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
 #logger.debug(f"{cpu_count=}")
 
 # Pull the generated text from the streamer, and update the model output.
-config = GenerationConfig(reset=True)
+#config = GenerationConfig(reset=True)
 history[-1][1] = ""
-for new_text in ctrans_generate(prompt=
-if new_text == None: new_text =
-history[-1][1] += new_text
+for new_text in ctrans_llm.generate(tokens, top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty): #ctrans_generate(prompt=tokens, config=config):
+    if new_text == None: new_text = ""
+    history[-1][1] += ctrans_llm.detokenize(new_text) #new_text
 yield history
 
-@dataclass
-class GenerationConfig:
-    temperature: float = temperature
-    top_k: int = top_k
-    top_p: float = top_p
-    repetition_penalty: float = repetition_penalty
-    last_n_tokens: int = last_n_tokens
-    max_new_tokens: int = max_new_tokens
-    #seed: int = 42
-    reset: bool = reset
-    stream: bool = stream
-    threads: int = threads
-    batch_size:int = batch_size
-    #context_length:int = context_length
-    #gpu_layers:int = gpu_layers
-    #stop: list[str] = field(default_factory=lambda: [stop_string])
 
 def ctrans_generate(
 prompt: str,
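
Note: the change in produce_streaming_answer_chatbot_ctrans drives ctransformers at the token level (tokenize, generate, detokenize) instead of going through the old ctrans_generate prompt-level call. A self-contained sketch of that loop, assuming the GGUF repo and file named in the diff; the prompt text and sampling values below are illustrative, not the app's actual settings.

from ctransformers import AutoModelForCausalLM

# Load the Orca Mini 3B GGUF weights referenced in the commit (CPU by default;
# gpu_layers > 0 offloads layers when ctransformers is installed with CUDA support)
llm = AutoModelForCausalLM.from_pretrained(
    'juanjgit/orca_mini_3B-GGUF',
    model_type='llama',
    model_file='orca-mini-3b.q4_0.gguf',
    context_length=2048,
    gpu_layers=0,
)

prompt = """### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
### User:
What is a GGUF model file?
### Response:"""

# tokenize() returns token ids, generate() yields ids one at a time,
# and detokenize() turns each id back into text for incremental display
tokens = llm.tokenize(prompt)
answer = ""
for token in llm.generate(tokens, top_k=40, temperature=0.7, repetition_penalty=1.1):
    answer += llm.detokenize(token)
print(answer)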
requirements.txt CHANGED

@@ -17,7 +17,7 @@ gradio
 gradio_client==0.2.7
 python-docx
 gpt4all
-ctransformers
+ctransformers[cuda]
 keybert
 span_marker
 gensim