Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,16 +1,15 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
from langchain_openai import ChatOpenAI
|
| 3 |
import os
|
| 4 |
import dotenv
|
| 5 |
-
from langchain_community.document_loaders import
|
| 6 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_chroma import Chroma
|
| 8 |
-
from langchain_openai import OpenAIEmbeddings
|
| 9 |
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 10 |
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 11 |
from langchain_core.messages import HumanMessage, AIMessage
|
| 12 |
from langchain.memory import ConversationBufferMemory
|
| 13 |
-
|
| 14 |
|
| 15 |
# Set page config
|
| 16 |
st.set_page_config(page_title="Enterprise document search + chat", layout="wide")
|
|
@@ -27,153 +26,156 @@ with st.sidebar:
|
|
| 27 |
|
| 28 |
# Main app logic
|
| 29 |
if "OPENAI_API_KEY" in os.environ:
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
elif file_extension == '.pdf':
|
| 53 |
-
loader = PyPDFLoader(temp_file_path)
|
| 54 |
-
elif file_extension == '.csv':
|
| 55 |
-
loader = CSVLoader(temp_file_path)
|
| 56 |
-
elif file_extension in ['.ppt', '.pptx']:
|
| 57 |
-
loader = UnstructuredPowerPointLoader(temp_file_path)
|
| 58 |
-
elif file_extension in ['.doc', '.docx']:
|
| 59 |
-
loader = UnstructuredWordDocumentLoader(temp_file_path)
|
| 60 |
-
elif file_extension in ['.xls', '.xlsx']:
|
| 61 |
-
loader = UnstructuredExcelLoader(temp_file_path)
|
| 62 |
-
else:
|
| 63 |
-
os.unlink(temp_file_path)
|
| 64 |
-
raise ValueError(f"Unsupported file type: {file_extension}")
|
| 65 |
-
|
| 66 |
-
documents = loader.load()
|
| 67 |
os.unlink(temp_file_path)
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 70 |
-
all_splits = text_splitter.split_documents(
|
| 71 |
-
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
|
| 72 |
-
vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)
|
| 73 |
-
retriever = vectorstore.as_retriever(k=4)
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
|
| 81 |
-
|
|
|
|
| 82 |
|
| 83 |
-
|
| 84 |
|
| 85 |
-
|
| 86 |
|
| 87 |
-
|
| 88 |
|
| 89 |
-
|
| 90 |
|
| 91 |
-
|
| 92 |
|
| 93 |
-
|
| 94 |
|
| 95 |
-
|
| 96 |
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
-
{context}
|
| 101 |
-
</context>
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
"system",
|
| 111 |
-
SYSTEM_TEMPLATE,
|
| 112 |
-
),
|
| 113 |
-
MessagesPlaceholder(variable_name="chat_history"),
|
| 114 |
-
MessagesPlaceholder(variable_name="messages"),
|
| 115 |
-
]
|
| 116 |
-
)
|
| 117 |
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
-
# else:
|
| 122 |
-
# st.warning("Please Upload File to Continue")
|
| 123 |
|
| 124 |
-
|
| 125 |
-
with st.spinner("Initializing Assistant..."):
|
| 126 |
-
retriever, document_chain = initialize_components()
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
st.session_state.messages = []
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
st.markdown(message["content"])
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
# Add user message to chat history
|
| 149 |
-
st.session_state.messages.append({"role": "user", "content": prompt})
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
#
|
| 169 |
-
|
| 170 |
-
message_placeholder.markdown(full_response)
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
|
| 175 |
-
|
| 176 |
-
st.
|
| 177 |
|
| 178 |
else:
|
| 179 |
st.warning("Please enter your OpenAI API Key in the sidebar to start the chatbot.")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
| 3 |
import os
|
| 4 |
import dotenv
|
| 5 |
+
from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader, UnstructuredPowerPointLoader, UnstructuredWordDocumentLoader, UnstructuredExcelLoader
|
| 6 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_chroma import Chroma
|
|
|
|
| 8 |
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 9 |
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 10 |
from langchain_core.messages import HumanMessage, AIMessage
|
| 11 |
from langchain.memory import ConversationBufferMemory
|
| 12 |
+
import tempfile
|
| 13 |
|
| 14 |
# Configure the Streamlit page: browser-tab title and the "wide" layout.
st.set_page_config(
    layout="wide",
    page_title="Enterprise document search + chat",
)
|
|
|
|
| 26 |
|
| 27 |
# Main app logic
|
| 28 |
if "OPENAI_API_KEY" in os.environ:
|
| 29 |
+
st.header('Multiple File Upload')
# The accepted extensions mirror the branches in load_file(). 'docx' must be
# listed here as well: load_file() handles '.docx', but the uploader rejects
# any extension that is missing from this list before load_file() ever runs.
uploaded_files = st.file_uploader(
    'Upload your files',
    accept_multiple_files=True,
    type=['txt', 'pdf', 'csv', 'ppt', 'pptx', 'doc', 'docx', 'xls', 'xlsx'],
)
|
| 31 |
+
|
| 32 |
+
def load_file(file):
    """Parse one uploaded file into documents using an extension-specific loader.

    The upload's bytes are written to a temporary file on disk, because the
    loaders are constructed from a file path. The loader is selected from the
    lower-cased file extension. The temporary file is always removed before
    this function returns or raises.

    Args:
        file: Uploaded file object exposing ``name`` and ``getvalue()``.

    Returns:
        The result of the selected loader's ``load()`` call.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    file_extension = os.path.splitext(file.name)[1].lower()

    # delete=False: the file must outlive the handle so a loader can reopen it
    # by path; cleanup is therefore done explicitly in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
    temp_file_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(file.getvalue())

        if file_extension == '.txt':
            loader = TextLoader(temp_file_path)
        elif file_extension == '.pdf':
            loader = PyPDFLoader(temp_file_path)
        elif file_extension == '.csv':
            loader = CSVLoader(temp_file_path)
        elif file_extension in ['.ppt', '.pptx']:
            loader = UnstructuredPowerPointLoader(temp_file_path)
        elif file_extension in ['.doc', '.docx']:
            loader = UnstructuredWordDocumentLoader(temp_file_path)
        elif file_extension in ['.xls', '.xlsx']:
            loader = UnstructuredExcelLoader(temp_file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")

        return loader.load()
    finally:
        # Runs on success, on an unsupported extension, and when a loader
        # fails, so a bad upload can no longer leave a stray temp file behind.
        os.unlink(temp_file_path)
|
| 57 |
+
|
| 58 |
+
# Process uploaded files
|
| 59 |
+
if uploaded_files:
|
| 60 |
+
# Load every uploaded file and gather the resulting documents in one flat list.
all_documents = [
    document
    for uploaded in uploaded_files
    for document in load_file(uploaded)
]

# Split the documents into 1000-character chunks with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
all_splits = text_splitter.split_documents(all_documents)
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
# Initialize components
|
| 68 |
+
@st.cache_resource
def _build_components(fingerprint, _all_splits):
    """Build the retriever and question-answering chain for one set of chunks.

    Args:
        fingerprint: Hex digest identifying the chunk contents. It is the only
            argument Streamlit hashes, so it is what the cache entry is keyed on.
        _all_splits: The document chunks to embed. The leading underscore tells
            Streamlit not to hash this argument.

    Returns:
        A ``(retriever, document_chain)`` tuple.
    """
    dotenv.load_dotenv()
    chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.2)
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    # Each distinct document set gets its own collection instead of the shared
    # default name, so chunks from different uploads are never mixed together.
    # NOTE(review): this presumes in-process Chroma clients can share state
    # between calls -- confirm against the installed chromadb version.
    vectorstore = Chroma.from_documents(
        documents=_all_splits,
        embedding=embeddings,
        collection_name=f"docs-{fingerprint[:32]}",
    )
    # The number of results is a search parameter and must go through
    # ``search_kwargs``; a bare ``k=4`` keyword is not read by the retriever.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

    # The conversation history is supplied through the "chat_history"
    # MessagesPlaceholder below; it is deliberately not interpolated into the
    # system text as well, which would repeat it as a Python list repr.
    SYSTEM_TEMPLATE = """
    You are an advanced AI assistant designed for document search and chatbot functionality. Your primary functions are:

    1. Process and structure multiple documents in various formats, including:
    .txt, .pdf, .csv, .ppt, .doc, .xls, .pptx, and .xlsx

    2. Extract and organize information from these unstructured documents into a coherent, searchable format.

    3. Retrieve relevant information from the processed documents based on user queries.

    4. Act as a chatbot, engaging in conversations about the content of the documents.

    5. Provide accurate and contextual responses to user questions, drawing solely from the information contained within the processed documents.

    6. If a user's question is not related to the content of the provided documents, politely inform them that you can only answer questions based on the information in the given documents.

    7. When answering, cite the specific document or section where the information was found, if possible.

    8. If there's ambiguity in a query, ask for clarification to ensure you provide the most relevant information.

    9. Maintain confidentiality and do not share or discuss information from one user's documents with other users.

    Remember, your knowledge is limited to the content of the documents you've been given to process. Do not provide information or answer questions that are outside the scope of these documents. Always strive for accuracy and relevance in your responses.

    <context>
    {context}
    </context>
    """

    question_answering_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                SYSTEM_TEMPLATE,
            ),
            MessagesPlaceholder(variable_name="chat_history"),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )

    document_chain = create_stuff_documents_chain(chat, question_answering_prompt)

    return retriever, document_chain


def initialize_components(_all_splits):
    """Return the retriever and QA chain for the given document chunks.

    The expensive build (embedding the chunks and creating the chain) is cached
    by Streamlit. The cache is keyed on a digest of the chunk texts: because
    ``_all_splits`` itself is excluded from Streamlit's hashing, caching on it
    alone would hand back the first upload's vector store for every later
    upload and for every other session.

    Args:
        _all_splits: Document chunks exposing a ``page_content`` string.

    Returns:
        A ``(retriever, document_chain)`` tuple.
    """
    import hashlib

    digest = hashlib.sha256()
    for split in _all_splits:
        # Only the text is hashed. NOTE(review): metadata presumably carries
        # the per-run temporary file path, which would defeat the cache on
        # every rerun -- confirm before adding it to the key.
        digest.update(split.page_content.encode("utf-8"))
        digest.update(b"\x00")  # separator so chunk boundaries affect the key

    return _build_components(digest.hexdigest(), _all_splits)
|
|
|
|
|
|
|
| 122 |
|
| 123 |
+
# Load components
with st.spinner("Initializing Assistant..."):
    retriever, document_chain = initialize_components(all_splits)

# Initialize memory for each session; it feeds the "chat_history" prompt slot.
if "memory" not in st.session_state:
    st.session_state.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Chat interface
st.subheader("Chat with Assistant")

# Initialize the list of displayed messages (kept separately from the memory
# object: this one only drives what is re-rendered on each rerun).
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# React to user input
if prompt := st.chat_input("What would you like to know about Document?"):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("assistant"):
        message_placeholder = st.empty()

        # Retrieve relevant documents. ``invoke`` is the current retriever
        # entry point; ``get_relevant_documents`` is deprecated.
        docs = retriever.invoke(prompt)

        # Generate response
        response = document_chain.invoke(
            {
                "context": docs,
                "chat_history": st.session_state.memory.load_memory_variables({})["chat_history"],
                "messages": [
                    HumanMessage(content=prompt)
                ],
            }
        )

        # The response is already a string, so we can use it directly
        full_response = response
        message_placeholder.markdown(full_response)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": full_response})

    # Update memory so the next turn sees this exchange
    st.session_state.memory.save_context({"input": prompt}, {"output": full_response})
|
| 176 |
|
| 177 |
+
else:
|
| 178 |
+
st.warning("Please upload files to continue.")
|
| 179 |
|
| 180 |
else:
|
| 181 |
st.warning("Please enter your OpenAI API Key in the sidebar to start the chatbot.")
|