Mythus committed on
Commit
22c5eeb
·
verified ·
1 Parent(s): fd2c532

Upload 7 files

Browse files
Files changed (7) hide show
  1. LICENSE +21 -0
  2. README.md +26 -12
  3. app_chat.py +112 -0
  4. constants.py +17 -0
  5. langchain_utils.py +103 -0
  6. requirements.txt +6 -0
  7. search_indexing.py +45 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Elton Vieira
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,26 @@
1
- ---
2
- title: BooksCheating
3
- emoji: 📈
4
- colorFrom: red
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.34.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ![image](https://github.com/ergv03/chat-with-pdf-llm/assets/23053920/969edf03-4451-4909-98d9-601d92a17e83)
3
+
4
+ ## Overview:
5
+
6
+ Simple web-based chat app, built using [Streamlit](https://streamlit.io/) and [Langchain](https://python.langchain.com/). The app backend follows the Retrieval Augmented Generation (RAG) framework.
7
+
8
+ Allows the user to provide a list of PDFs, and ask questions to a LLM (today only OpenAI GPT is implemented) that can be answered by these PDF documents.
9
+
10
+ User needs to provide their own OpenAI API key.
11
+
12
+ ## Installation:
13
+
14
+ Just clone the repo and install the requirements using ```pip install -r requirements.txt```
15
+
16
+ ## How to run locally:
17
+
18
+ Run ```streamlit run app_chat.py``` in your terminal.
19
+
20
+ Add the URLs of the PDF documents that are relevant to your queries, and start chatting with the bot.
21
+
22
+ ## How it works:
23
+
24
+ The provided PDFs will be downloaded and properly split into chunks, and finally embedding vectors for each chunk will be generated using OpenAI service. These vectors are then indexed using FAISS, and can be quickly retrieved.
25
+
26
+ As the user interacts with the bot, new relevant document chunks/snippets are retrieved and added to the session memory, alongside the past few messages. These snippets and messages are part of the prompt sent to the LLM; this way, the model will have as context not just the latest message and retrieved snippet, but past ones as well.
app_chat.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from constants import search_number_messages
4
+ from langchain_utils import initialize_chat_conversation
5
+ from search_indexing import download_and_index_pdf
6
+ import re
7
+
8
+
9
+ def remove_url(url_to_remove):
10
+ """
11
+ Remove URLs from the session_state. Triggered by the respective button
12
+ """
13
+ if url_to_remove in st.session_state.urls:
14
+ st.session_state.urls.remove(url_to_remove)
15
+
16
+
17
+ # Page title
18
+ st.set_page_config(page_title='Talk with PDFs using LLMs - Beta')
19
+ st.title('Talk with PDFs using LLMs - (Beta)')
20
+
21
+ # Initialize the faiss_index key in the session state. This can be used to avoid having to download and embed the same PDF
22
+ # every time the user asks a question
23
+ if 'faiss_index' not in st.session_state:
24
+ st.session_state['faiss_index'] = {
25
+ 'indexed_urls': [],
26
+ 'index': None
27
+ }
28
+
29
+ # Initialize conversation memory used by Langchain
30
+ if 'conversation_memory' not in st.session_state:
31
+ st.session_state['conversation_memory'] = None
32
+
33
+ # Initialize chat history used by StreamLit (for display purposes)
34
+ if "messages" not in st.session_state:
35
+ st.session_state.messages = []
36
+
37
+ # Store the URLs added by the user in the UI
38
+ if 'urls' not in st.session_state:
39
+ st.session_state.urls = []
40
+
41
+ with st.sidebar:
42
+
43
+ openai_api_key = st.text_input('Step 1 - OpenAI API Key:', type='password')
44
+
45
+ # Add/Remove URLs form
46
+ with st.form('urls-form', clear_on_submit=True):
47
+ url = st.text_input('Step 2 - URLs to relevant PDFs: ')
48
+ add_url_button = st.form_submit_button('Add')
49
+ if add_url_button:
50
+ if url not in st.session_state.urls:
51
+ st.session_state.urls.append(url)
52
+
53
+ # Display a container with the URLs added by the user so far
54
+ with st.container():
55
+ if st.session_state.urls:
56
+ st.header('URLs added:')
57
+ for url in st.session_state.urls:
58
+ st.write(url)
59
+ st.button(label='Remove', key=f"Remove {url}", on_click=remove_url, kwargs={'url_to_remove': url})
60
+ st.divider()
61
+
62
+ # Display chat messages from history on app rerun
63
+ for message in st.session_state.messages:
64
+ with st.chat_message(message["role"]):
65
+ st.markdown(message["content"])
66
+
67
+ # React to user input
68
+ if query_text := st.chat_input("Your message"):
69
+
70
+ os.environ['OPENAI_API_KEY'] = openai_api_key
71
+
72
+ # Display user message in chat message container, and append to session state
73
+ st.chat_message("user").markdown(query_text)
74
+ st.session_state.messages.append({"role": "user", "content": query_text})
75
+
76
+ # Check if FAISS index already exists, or if it needs to be created as it includes new URLs
77
+ session_urls = st.session_state.urls
78
+ if st.session_state['faiss_index']['index'] is None or set(st.session_state['faiss_index']['indexed_urls']) != set(session_urls):
79
+ st.session_state['faiss_index']['indexed_urls'] = session_urls
80
+ with st.spinner('Downloading and indexing PDFs...'):
81
+ faiss_index = download_and_index_pdf(session_urls)
82
+ st.session_state['faiss_index']['index'] = faiss_index
83
+ else:
84
+ faiss_index = st.session_state['faiss_index']['index']
85
+
86
+ # Check if conversation memory has already been initialized and is part of the session state
87
+ if st.session_state['conversation_memory'] is None:
88
+ conversation = initialize_chat_conversation(faiss_index)
89
+ st.session_state['conversation_memory'] = conversation
90
+ else:
91
+ conversation = st.session_state['conversation_memory']
92
+
93
+ # Search PDF snippets using the last few user messages
94
+ user_messages_history = [message['content'] for message in st.session_state.messages[-search_number_messages:] if message['role'] == 'user']
95
+ user_messages_history = '\n'.join(user_messages_history)
96
+
97
+ with st.spinner('Querying OpenAI GPT...'):
98
+ response = conversation.predict(input=query_text, user_messages_history=user_messages_history)
99
+
100
+ # Display assistant response in chat message container
101
+ with st.chat_message("assistant"):
102
+ st.markdown(response)
103
+ snippet_memory = conversation.memory.memories[1]
104
+ for page_number, snippet in zip(snippet_memory.pages, snippet_memory.snippets):
105
+ with st.expander(f'Snippet from page {page_number + 1}'):
106
+ # Remove the <START> and <END> tags from the snippets before displaying them
107
+ snippet = re.sub("<START_SNIPPET_PAGE_\d+>", '', snippet)
108
+ snippet = re.sub("<END_SNIPPET_PAGE_\d+>", '', snippet)
109
+ st.markdown(snippet)
110
+
111
+ # Add assistant response to chat history
112
+ st.session_state.messages.append({"role": "assistant", "content": response})
constants.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Number of snippets that will be added to the prompt. Too many snippets and you risk both the prompt going over the
2
+ # token limit, and the model not being able to find the correct answer
3
+ prompt_number_snippets = 3
4
+
5
+ # GPT related constants
6
+ gpt_model_to_use = 'gpt-4'
7
+ gpt_max_tokens = 1000
8
+
9
+ # Number of past user messages that will be used to search relevant snippets
10
+ search_number_messages = 4
11
+
12
+ # PDF Chunking constants
13
+ chunk_size = 500
14
+ chunk_overlap = 50
15
+
16
+ # Number of snippets to be retrieved by FAISS
17
+ number_snippets_to_retrieve = 3
langchain_utils.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain import FAISS
2
+ from langchain.chat_models import ChatOpenAI
3
+ from langchain.chains import ConversationChain
4
+ from langchain.memory import ConversationBufferWindowMemory, CombinedMemory
5
+ from langchain import PromptTemplate
6
+ from constants import prompt_number_snippets, gpt_model_to_use, gpt_max_tokens
7
+ from search_indexing import search_faiss_index
8
+
9
+
10
+ class SnippetsBufferWindowMemory(ConversationBufferWindowMemory):
11
+ """
12
+ MemoryBuffer used to hold the document snippets. Inherits from ConversationBufferWindowMemory, and overwrites the
13
+ load_memory_variables method
14
+ """
15
+
16
+ index: FAISS = None
17
+ pages: list = []
18
+ memory_key = 'snippets'
19
+ snippets: list = []
20
+
21
+ def __init__(self, *args, **kwargs):
22
+ ConversationBufferWindowMemory.__init__(self, *args, **kwargs)
23
+ self.index = kwargs['index']
24
+
25
+ def load_memory_variables(self, inputs) -> dict:
26
+ """
27
+ Based on the user inputs, search the index and add the similar snippets to memory (but only if they aren't in the
28
+ memory already)
29
+ """
30
+
31
+ # Search snippets
32
+ similar_snippets = search_faiss_index(self.index, inputs['user_messages_history'])
33
+ # In order to respect the buffer size and make its pruning work, need to reverse the list, and then un-reverse it later
34
+ # This way, the most relevant snippets are kept at the start of the list
35
+ self.snippets = [snippet for snippet in reversed(self.snippets)]
36
+ self.pages = [page for page in reversed(self.pages)]
37
+
38
+ for snippet in similar_snippets:
39
+ page_number = snippet.metadata['page']
40
+ # Load into memory only new snippets
41
+ snippet_to_add = f"The following snippet was extracted from the following document: "
42
+ if snippet.metadata['title'] == snippet.metadata['source']:
43
+ snippet_to_add += f"{snippet.metadata['source']}\n"
44
+ else:
45
+ snippet_to_add += f"[{snippet.metadata['title']}]({snippet.metadata['source']})\n"
46
+
47
+ snippet_to_add += f"<START_SNIPPET_PAGE_{page_number + 1}>\n"
48
+ snippet_to_add += f"{snippet.page_content}\n"
49
+ snippet_to_add += f"<END_SNIPPET_PAGE_{page_number + 1}>\n"
50
+ if snippet_to_add not in self.snippets:
51
+ self.pages.append(page_number)
52
+ self.snippets.append(snippet_to_add)
53
+
54
+ # Reverse list of snippets and pages, in order to keep the most relevant at the top
55
+ # Also prune the list to keep the buffer within the define size (k)
56
+ self.snippets = [snippet for snippet in reversed(self.snippets)][:self.k]
57
+ self.pages = [page for page in reversed(self.pages)][:self.k]
58
+ to_return = ''.join(self.snippets)
59
+
60
+ return {'snippets': to_return}
61
+
62
+
63
+ def construct_conversation(prompt: str, llm, memory) -> ConversationChain:
64
+ """
65
+ Construct a ConversationChain object
66
+ """
67
+
68
+ prompt = PromptTemplate.from_template(
69
+ template=prompt,
70
+ )
71
+
72
+ conversation = ConversationChain(
73
+ llm=llm,
74
+ memory=memory,
75
+ verbose=False,
76
+ prompt=prompt
77
+ )
78
+
79
+ return conversation
80
+
81
+
82
+ def initialize_chat_conversation(index: FAISS,
83
+ model_to_use: str = gpt_model_to_use,
84
+ max_tokens: int = gpt_max_tokens) -> ConversationChain:
85
+
86
+ prompt_header = """You are an expert, tasked with helping customers with their questions. They will ask you questions and provide technical snippets that may or may not contain the answer, and it's your job to find the answer if possible, while taking into account the entire conversation context.
87
+ The following snippets can be used to help you answer the questions:
88
+ {snippets}
89
+ The following is a friendly conversation between a customer and you. Please answer the customer's needs based on the provided snippets and the conversation history. Make sure to take the previous messages in consideration, as they contain additional context.
90
+ If the provided snippets don't include the answer, please say so, and don't try to make up an answer instead. Include in your reply the title of the document and the page from where your answer is coming from, if applicable.
91
+
92
+ {history}
93
+ Customer: {input}
94
+ """
95
+
96
+ llm = ChatOpenAI(model_name=model_to_use, max_tokens=max_tokens)
97
+ conv_memory = ConversationBufferWindowMemory(k=3, input_key="input")
98
+ snippets_memory = SnippetsBufferWindowMemory(k=prompt_number_snippets, index=index, memory_key='snippets', input_key="snippets")
99
+ memory = CombinedMemory(memories=[conv_memory, snippets_memory])
100
+
101
+ conversation = construct_conversation(prompt_header, llm, memory)
102
+
103
+ return conversation
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ faiss-cpu==1.7.4
2
+ langchain==0.0.248
3
+ openai==0.27.7
4
+ streamlit==1.25.0
5
+ pypdfium2==4.18.0
6
+ tiktoken==0.4.0
search_indexing.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain import FAISS
2
+ from langchain.document_loaders import PyPDFium2Loader
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ import pypdfium2 as pdfium
6
+ from constants import chunk_size, chunk_overlap, number_snippets_to_retrieve
7
+
8
+
9
+ def download_and_index_pdf(urls: list[str]) -> FAISS:
10
+ """
11
+ Download and index a list of PDFs based on the URLs
12
+ """
13
+
14
+ def __update_metadata(pages, url):
15
+ """
16
+ Add to the document metadata the title and original URL
17
+ """
18
+ for page in pages:
19
+ pdf = pdfium.PdfDocument(page.metadata['source'])
20
+ title = pdf.get_metadata_dict().get('Title', url)
21
+ page.metadata['source'] = url
22
+ page.metadata['title'] = title
23
+ return pages
24
+
25
+ all_pages = []
26
+ for url in urls:
27
+ loader = PyPDFium2Loader(url)
28
+ splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
29
+ pages = loader.load_and_split(splitter)
30
+ pages = __update_metadata(pages, url)
31
+ all_pages += pages
32
+
33
+ faiss_index = FAISS.from_documents(all_pages, OpenAIEmbeddings())
34
+
35
+ return faiss_index
36
+
37
+
38
+ def search_faiss_index(faiss_index: FAISS, query: str, top_k: int = number_snippets_to_retrieve) -> list:
39
+ """
40
+ Search a FAISS index, using the passed query
41
+ """
42
+
43
+ docs = faiss_index.similarity_search(query, k=top_k)
44
+
45
+ return docs