Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import os
+
 from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
@@ -9,30 +10,39 @@ from langchain.llms import HuggingFacePipeline
 from langchain.chains import ConversationChain
 from langchain.memory import ConversationBufferMemory
 from langchain.llms import HuggingFaceHub
+
 from pathlib import Path
 import chromadb
+
 from transformers import AutoTokenizer
 import transformers
 import torch
 import tqdm
 import accelerate
 
+
+
 llm_name0 = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-list_llm = [llm_name0]
+list_llm = [llm_name0, llm_name1, llm_name2, llm_name3, llm_name4, llm_name5, llm_name6, llm_name7, llm_name8]
 list_llm_simple = [os.path.basename(llm) for llm in list_llm]
 
 # Load PDF document and create doc splits
 def load_doc(list_file_path, chunk_size, chunk_overlap):
+    # Processing for one document only
+    # loader = PyPDFLoader(file_path)
+    # pages = loader.load()
     loaders = [PyPDFLoader(x) for x in list_file_path]
     pages = []
     for loader in loaders:
         pages.extend(loader.load())
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size = chunk_size,
         chunk_overlap = chunk_overlap)
     doc_splits = text_splitter.split_documents(pages)
     return doc_splits
 
+
 # Create vector database
 def create_db(splits, collection_name):
     embedding = HuggingFaceEmbeddings()
@@ -42,32 +52,38 @@ def create_db(splits, collection_name):
         embedding=embedding,
         client=new_client,
         collection_name=collection_name,
+        # persist_directory=default_persist_directory
     )
     return vectordb
 
+
 # Load vector database
 def load_db():
     embedding = HuggingFaceEmbeddings()
     vectordb = Chroma(
+        # persist_directory=default_persist_directory,
         embedding_function=embedding)
     return vectordb
 
+
 # Initialize langchain LLM chain
 def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
     progress(0.1, desc="Initializing HF tokenizer...")
     progress(0.5, desc="Initializing HF Hub...")
+    # URL: https://github.com/langchain-ai/langchain/issues/6080
     if llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
         llm = HuggingFaceHub(
             repo_id=llm_model,
             model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens, "top_k": top_k, "load_in_8bit": True}
         )
+
     progress(0.75, desc="Defining buffer memory...")
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         output_key='answer',
         return_messages=True
     )
-
+    # retriever=vector_db.as_retriever(search_type="similarity", search_kwargs={'k': 3})
     retriever=vector_db.as_retriever()
     progress(0.8, desc="Defining retrieval chain...")
     qa_chain = ConversationalRetrievalChain.from_llm(
@@ -75,27 +91,42 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
         retriever=retriever,
         chain_type="stuff",
         memory=memory,
+        # combine_docs_chain_kwargs={"prompt": your_prompt})
         return_source_documents=True,
+        # return_generated_question=True,
+        # verbose=True,
     )
     progress(0.9, desc="Done!")
     return qa_chain
 
+
+# Initialize database
 def initialize_database(list_file_obj, chunk_size, chunk_overlap, progress=gr.Progress()):
+    # Create list of documents (when valid)
+    # file_path = file_obj.name
     list_file_path = [x.name for x in list_file_obj if x is not None]
     collection_name = Path(list_file_path[0]).stem
+    # print('list_file_path: ', list_file_path)
+    # print('Collection name: ', collection_name)
     progress(0.25, desc="Loading document...")
+    # Load document and create splits
     doc_splits = load_doc(list_file_path, chunk_size, chunk_overlap)
+    # Create or load Vector database
     progress(0.5, desc="Generating vector database...")
+    # global vector_db
     vector_db = create_db(doc_splits, collection_name)
     progress(0.9, desc="Done!")
     return vector_db, collection_name, "Complete!"
 
+
 def initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db, progress=gr.Progress()):
+    # print("llm_option",llm_option)
     llm_name = list_llm[llm_option]
     print("llm_name: ",llm_name)
     qa_chain = initialize_llmchain(llm_name, llm_temperature, max_tokens, top_k, vector_db, progress)
     return qa_chain, "Complete!"
 
+
 def format_chat_history(message, chat_history):
     formatted_chat_history = []
     for user_message, bot_message in chat_history:
@@ -103,25 +134,39 @@ def format_chat_history(message, chat_history):
         formatted_chat_history.append(f"Assistant: {bot_message}")
     return formatted_chat_history
 
+
 def conversation(qa_chain, message, history):
     formatted_chat_history = format_chat_history(message, history)
+    #print("formatted_chat_history",formatted_chat_history)
+
+    # Generate response using QA chain
     response = qa_chain({"question": message, "chat_history": formatted_chat_history})
     response_answer = response["answer"]
     response_sources = response["source_documents"]
     response_source1 = response_sources[0].page_content.strip()
     response_source2 = response_sources[1].page_content.strip()
+    # Langchain sources are zero-based
     response_source1_page = response_sources[0].metadata["page"] + 1
     response_source2_page = response_sources[1].metadata["page"] + 1
+    # print ('chat response: ', response_answer)
+    # print('DB source', response_sources)
+
+    # Append user message and response to chat history
     new_history = history + [(message, response_answer)]
+    # return gr.update(value=""), new_history, response_sources[0], response_sources[1]
     return qa_chain, gr.update(value=""), new_history, response_source1, response_source1_page, response_source2, response_source2_page
 
+
 def upload_file(file_obj):
     list_file_path = []
     for idx, file in enumerate(file_obj):
         file_path = file_obj.name
         list_file_path.append(file_path)
+        # print(file_path)
+        # initialize_database(file_path, progress)
     return list_file_path
 
+
 def demo():
     with gr.Blocks(theme="base") as demo:
         vector_db = gr.State()
@@ -129,10 +174,16 @@ def demo():
         collection_name = gr.State()
 
         gr.Markdown(
-        """<center><h2>PDF-based chatbot (powered by LangChain and open-source LLMs)</center></h2>
+        """<center><h2>PDF-based chatbot (powered by LangChain and open-source LLMs)</center></h2>
+        <h3>Ask any questions about your PDF documents, along with follow-ups</h3>
+        <b>Note:</b> This AI assistant performs retrieval-augmented generation from your PDF documents. \
+        When generating answers, it takes past questions into account (via conversational memory), and includes document references for clarity purposes.</i>
+        <br><b>Warning:</b> This space uses the free CPU Basic hardware from Hugging Face. Some steps and LLM models used below (free inference endpoints) can take some time to generate an output.<br>
+        """)
         with gr.Tab("Step 1 - Document pre-processing"):
             with gr.Row():
                 document = gr.Files(height=100, file_count="multiple", file_types=["pdf"], interactive=True, label="Upload your PDF documents (single or multiple)")
+                # upload_btn = gr.UploadButton("Loading document...", height=100, file_count="multiple", file_types=["pdf"], scale=1)
             with gr.Row():
                 db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database")
             with gr.Accordion("Advanced options - Document text splitter", open=False):
@@ -177,6 +228,7 @@ def demo():
             clear_btn = gr.ClearButton([msg, chatbot])
 
         # Preprocessing events
+        #upload_btn.upload(upload_file, inputs=[upload_btn], outputs=[document])
         db_btn.click(initialize_database, \
             inputs=[document, slider_chunk_size, slider_chunk_overlap], \
             outputs=[vector_db, collection_name, db_progress])
@@ -202,5 +254,6 @@ def demo():
             queue=False)
     demo.queue().launch(debug=True)
 
+
 if __name__ == "__main__":
     demo()
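For reference, the retrieval-augmented generation flow that app.py wires into Gradio reduces to a handful of LangChain calls. Below is a minimal standalone sketch, not part of the commit: it assumes the same legacy (pre-0.1) LangChain APIs imported above, a HUGGINGFACEHUB_API_TOKEN set in the environment, a placeholder "sample.pdf", and illustrative sampling values standing in for the app's slider settings.

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Load and split the PDF (placeholder path), mirroring load_doc()
pages = PyPDFLoader("sample.pdf").load()
splits = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50).split_documents(pages)

# Embed the splits into an in-memory Chroma store, mirroring create_db()
vector_db = Chroma.from_documents(splits, embedding=HuggingFaceEmbeddings())

# Build the conversational chain, mirroring initialize_llmchain()
llm = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs={"temperature": 0.7, "max_new_tokens": 1024, "top_k": 3},
)
memory = ConversationBufferMemory(memory_key="chat_history", output_key="answer", return_messages=True)
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vector_db.as_retriever(),
    chain_type="stuff",
    memory=memory,
    return_source_documents=True,
)

# One question-answer turn, mirroring conversation(): the answer comes with the
# source documents, whose zero-based "page" metadata the app offsets by one.
response = qa_chain({"question": "What is this document about?", "chat_history": []})
print(response["answer"])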