Sean-Case committed
Commit: d2ddc62
1 Parent(s): f6036ad

Attempt to switch to Orca Mini GGUF

Files changed:
- app.py +3 -4
- chatfuncs/chatfuncs.py +49 -31
- requirements.txt +1 -1
app.py CHANGED

@@ -11,7 +11,6 @@ from langchain.vectorstores import FAISS
 PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 
 # Disable cuda devices if necessary
-
 #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
 #from chatfuncs.chatfuncs import *

@@ -155,7 +154,7 @@ with block:
 ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
 
 gr.HTML(
-"<center>Powered by
+"<center>Powered by Orca Mini and Langchain</a></center>"
 )
 
 examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])

@@ -177,14 +176,14 @@ with block:
 # Click/enter to send message action
 response_click = submit.click(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False, api_name="retrieval").\
 then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-then(chatf.
+then(chatf.produce_streaming_answer_chatbot_ctrans, inputs=[chatbot, instruction_prompt_out], outputs=chatbot)
 response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
 then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
 then(lambda: gr.update(interactive=True), None, [message], queue=False)
 
 response_enter = message.submit(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False).\
 then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-then(chatf.
+then(chatf.produce_streaming_answer_chatbot_ctrans, [chatbot, instruction_prompt_out], chatbot)
 response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
 then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
 then(lambda: gr.update(interactive=True), None, [message], queue=False)
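
Note: the click/submit chains above hand the chatbot state to a generator function, which is what makes the answer stream into the gr.Chatbot. A minimal standalone sketch of that pattern follows; names like add_user_message, fake_stream and demo are illustrative, not from this repo.

import time
import gradio as gr

def add_user_message(message, history):
    # Clear and lock the textbox while the answer is being generated
    return gr.update(value="", interactive=False), history + [[message, ""]]

def fake_stream(history):
    # A generator passed to .then() streams each yielded state into the Chatbot
    for word in "This answer arrives one word at a time.".split():
        history[-1][1] += word + " "
        time.sleep(0.05)
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    message = gr.Textbox()
    submit = gr.Button("Submit")

    submit.click(add_user_message, [message, chatbot], [message, chatbot], queue=False).\
        then(fake_stream, chatbot, chatbot).\
        then(lambda: gr.update(interactive=True), None, [message], queue=False)

demo.queue().launch()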
chatfuncs/chatfuncs.py CHANGED

@@ -7,12 +7,13 @@ import numpy as np
 
 # Model packages
 import torch
+torch.cuda.empty_cache()
 from threading import Thread
 from transformers import AutoTokenizer, pipeline, TextIteratorStreamer
 
 # Alternative model sources
 from gpt4all import GPT4All
-from ctransformers import AutoModelForCausalLM
+from ctransformers import AutoModelForCausalLM#, AutoTokenizer
 
 from dataclasses import asdict, dataclass
 

@@ -44,7 +45,11 @@ from gensim.similarities import SparseMatrixSimilarity
 
 import gradio as gr
 
-
+if torch.cuda.is_available():
+    torch_device = "cuda"
+    gpu_layers = 1
+else: torch_device = "cpu"
+
 print("Running on device:", torch_device)
 threads = 8#torch.get_num_threads()
 print("CPU threads:", threads)

@@ -72,9 +77,27 @@ stream: bool = True
 threads: int = threads
 batch_size:int = 512
 context_length:int = 2048
-gpu_layers:int = 0
+gpu_layers:int = 0#10#gpu_layers
 sample = True
 
+@dataclass
+class GenerationConfig:
+    temperature: float = temperature
+    top_k: int = top_k
+    top_p: float = top_p
+    repetition_penalty: float = repetition_penalty
+    last_n_tokens: int = last_n_tokens
+    max_new_tokens: int = max_new_tokens
+    #seed: int = 42
+    reset: bool = reset
+    stream: bool = stream
+    threads: int = threads
+    batch_size:int = batch_size
+    context_length:int = context_length
+    gpu_layers:int = gpu_layers
+    #stop: list[str] = field(default_factory=lambda: [stop_string])
+
+
 ## Highlight text constants
 hlt_chunk_size = 20
 hlt_strat = [" ", ".", "!", "?", ":", "\n\n", "\n", ","]

@@ -87,17 +110,20 @@ ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-mu
 # Used to pull out keywords from chat history to add to user queries behind the scenes
 kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 
+
+
 ## Chat models ##
 ctrans_llm = [] # Not leaded by default
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/orca_mini_3B-GGML', model_type='llama', model_file='orca-mini-3b.ggmlv3.q4_0.bin')
-
+ctrans_llm = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **asdict(GenerationConfig()))
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/vicuna-13B-v1.5-16K-GGUF', model_type='llama', model_file='vicuna-13b-v1.5-16k.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeUp-Llama-2-13B-Chat-HF-GGUF', model_type='llama', model_file='codeup-llama-2-13b-chat-hf.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeLlama-13B-Instruct-GGUF', model_type='llama', model_file='codellama-13b-instruct.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-Instruct-v0.1-GGUF', model_type='mistral', model_file='mistral-7b-instruct-v0.1.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf')
 
-
+
+#ctokenizer = AutoTokenizer.from_pretrained(ctrans_llm)
 
 # Huggingface chat model
 #hf_checkpoint = 'jphme/phi-1_5_Wizard_Vicuna_uncensored'

@@ -128,7 +154,7 @@ def create_hf_model(model_name):
 
 return model, tokenizer, torch_device
 
-model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
+#model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
 
 # Vectorstore funcs
 

@@ -196,6 +222,17 @@ def create_prompt_templates():
 
 ### Response:"""
 
+instruction_prompt_template_orca_input = """
+### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+### User:
+Answer the QUESTION using information from the following input.
+### Input:
+{summaries}
+QUESTION: {question}
+
+### Response:"""
+
 
 
 

@@ -581,9 +618,6 @@ def create_final_prompt(inputs: Dict[str, str], instruction_prompt, content_prom
 #print("The question passed to the vector search is:")
 #print(new_question_kworded)
 
-#docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(new_question_kworded, k_val = 5, out_passages = 3,
-#                                           vec_score_cut_off = 1.3, vec_weight = 1, tfidf_weight = 0.5, svm_weight = 1)
-
 docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages = 2,
                                            vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
                                            #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])

@@ -868,8 +902,8 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
 print("The question is: ")
 print(full_prompt)
 
-
-
+tokens = ctrans_llm.tokenize(full_prompt)
+
 #import psutil
 #from loguru import logger
 

@@ -884,29 +918,13 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
 #logger.debug(f"{cpu_count=}")
 
 # Pull the generated text from the streamer, and update the model output.
-config = GenerationConfig(reset=True)
+#config = GenerationConfig(reset=True)
 history[-1][1] = ""
-for new_text in ctrans_generate(prompt=
-if new_text == None: new_text =
-history[-1][1] += new_text
+for new_text in ctrans_llm.generate(tokens, top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty): #ctrans_generate(prompt=tokens, config=config):
+    if new_text == None: new_text = ""
+    history[-1][1] += ctrans_llm.detokenize(new_text) #new_text
 yield history
 
-@dataclass
-class GenerationConfig:
-    temperature: float = temperature
-    top_k: int = top_k
-    top_p: float = top_p
-    repetition_penalty: float = repetition_penalty
-    last_n_tokens: int = last_n_tokens
-    max_new_tokens: int = max_new_tokens
-    #seed: int = 42
-    reset: bool = reset
-    stream: bool = stream
-    threads: int = threads
-    batch_size:int = batch_size
-    #context_length:int = context_length
-    #gpu_layers:int = gpu_layers
-    #stop: list[str] = field(default_factory=lambda: [stop_string])
 
 def ctrans_generate(
 prompt: str,
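
Note: the change in produce_streaming_answer_chatbot_ctrans drives ctransformers at the token level (tokenize, generate, detokenize) instead of going through the old ctrans_generate prompt-level call. A self-contained sketch of that loop, assuming the GGUF repo and file named in the diff; the prompt text and sampling values below are illustrative, not the app's actual settings.

from ctransformers import AutoModelForCausalLM

# Load the Orca Mini 3B GGUF weights referenced in the commit (CPU by default;
# gpu_layers > 0 offloads layers when ctransformers is installed with CUDA support)
llm = AutoModelForCausalLM.from_pretrained(
    'juanjgit/orca_mini_3B-GGUF',
    model_type='llama',
    model_file='orca-mini-3b.q4_0.gguf',
    context_length=2048,
    gpu_layers=0,
)

prompt = """### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
### User:
What is a GGUF model file?
### Response:"""

# tokenize() returns token ids, generate() yields ids one at a time,
# and detokenize() turns each id back into text for incremental display
tokens = llm.tokenize(prompt)
answer = ""
for token in llm.generate(tokens, top_k=40, temperature=0.7, repetition_penalty=1.1):
    answer += llm.detokenize(token)
print(answer)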
requirements.txt CHANGED

@@ -17,7 +17,7 @@ gradio
 gradio_client==0.2.7
 python-docx
 gpt4all
-ctransformers
+ctransformers[cuda]
 keybert
 span_marker
 gensim