"""
complete, functional RAG App
stores vectors in session state, or locally.
add function to display retrieved documents
"""
import os
from datetime import datetime

import numpy as np
import faiss  # native FAISS library, needed by merge_faiss_indices below
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS

import faiss_utils
from html_templates import css, bot_template, user_template
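# "faiss_utils" is a local helper module that is not part of this file. A
# minimal sketch of the two functions this app expects from it, assuming it
# wraps LangChain's FAISS.save_local / FAISS.load_local (the embeddings
# argument is an assumption: the app calls load_vectorstore with a path only,
# so the real helper presumably supplies a default embeddings object):
#
#     def save_local(vectorstore, path):
#         vectorstore.save_local(path)
#
#     def load_vectorstore(path, embeddings=None):
#         embeddings = embeddings or OpenAIEmbeddings()
#         return FAISS.load_local(path, embeddings)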
def merge_faiss_indices(index1, index2):
    """
    Merge two FAISS indices into a new index, assuming both are of the same
    type and dimensionality.

    Args:
        index1 (faiss.Index): The first FAISS index.
        index2 (faiss.Index): The second FAISS index.

    Returns:
        faiss.Index: A new FAISS index containing all vectors from index1 and index2.
    """
    # Both indices must be the same type and dimensionality to be merged.
    if type(index1) is not type(index2):
        raise ValueError("Indices are of different types")
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")

    if isinstance(index1, faiss.IndexFlatL2):
        # Simple flat indices: reconstruct all stored vectors and re-add them.
        d = index1.d
        xb1 = index1.reconstruct_n(0, index1.ntotal)
        xb2 = index2.reconstruct_n(0, index2.ntotal)
        new_index = faiss.IndexFlatL2(d)
        new_index.add(np.vstack((xb1, xb2)))
        return new_index
    elif isinstance(index1, faiss.IndexIVFFlat):
        # Quantized indices (IndexIVFFlat): rebuild with the same
        # configuration, train on the combined vectors, then add them.
        d = index1.d
        nlist = index1.nlist
        quantizer = faiss.IndexFlatL2(d)  # re-create the coarse quantizer
        new_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
        # reconstruct_n on an IVF index requires a direct map.
        index1.make_direct_map()
        index2.make_direct_map()
        vecs1 = index1.reconstruct_n(0, index1.ntotal)
        vecs2 = index2.reconstruct_n(0, index2.ntotal)
        combined = np.vstack((vecs1, vecs2))
        new_index.train(combined)
        new_index.add(combined)
        return new_index
    else:
        raise TypeError("Index type not supported for merging in this function")
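# A minimal usage sketch for merge_faiss_indices (the dimension and the
# random vectors below are illustrative only):
#
#     a, b = faiss.IndexFlatL2(4), faiss.IndexFlatL2(4)
#     a.add(np.random.rand(10, 4).astype("float32"))
#     b.add(np.random.rand(5, 4).astype("float32"))
#     merged = merge_faiss_indices(a, b)
#     assert merged.ntotal == 15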
def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
    return text
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks
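# For example, with chunk_size=1000 and chunk_overlap=200, a 2,500-character
# text yields chunks covering roughly characters 0-1000, 800-1800 and
# 1600-2500. CharacterTextSplitter only splits at the "\n" separator, so the
# actual boundaries depend on where newlines fall in the text.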
def get_faiss_vectorstore(text_chunks):
    if sst.openai:
        my_embeddings = OpenAIEmbeddings()
    else:
        my_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=my_embeddings)
    return vectorstore
def get_conversation_chain(vectorstore):
    if sst.openai:
        llm = ChatOpenAI()
    else:
        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain
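# The chain is invoked as sst.conversation({'question': ...}) and returns a
# dict whose 'chat_history' key holds alternating user/AI messages; see
# handle_userinput below.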
def handle_userinput(user_question):
    response = sst.conversation({'question': user_question})
    sst.chat_history = response['chat_history']
    for i, message in enumerate(sst.chat_history):
        if i % 2 == 0:
            # Display the user message.
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            # Display the AI response.
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
            # Display source document information if available on the message.
            if hasattr(message, 'source') and message.source:
                st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
# API configuration. Several of these services (Vectara, Pinecone) are
# configured here but not used elsewhere in this file.
BASE_URL = "https://api.vectara.io/v1"
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}
def main():
    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
    st.write(css, unsafe_allow_html=True)

    # Initialize session state defaults.
    if "conversation" not in sst:
        sst.conversation = None
    if "chat_history" not in sst:
        sst.chat_history = None
    if "page" not in sst:
        sst.page = "home"
    if "openai" not in sst:
        sst.openai = True
    if "login" not in sst:
        sst.login = False
    if 'submitted_user_query' not in sst:
        sst.submitted_user_query = ''
    if 'submitted_user_safe' not in sst:
        sst.submitted_user_safe = ''
    if 'submitted_user_load' not in sst:
        sst.submitted_user_load = ''

    def submit_user_query():
        sst.submitted_user_query = sst.widget_user_query
        sst.widget_user_query = ''

    def submit_user_safe():
        sst.submitted_user_safe = sst.widget_user_safe
        sst.widget_user_safe = ''
        if "vectorstore" in sst:
            # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
            st.sidebar.success("saved")
        else:
            st.sidebar.warning("No embeddings to save. Please process documents first.")

    def submit_user_load():
        sst.submitted_user_load = sst.widget_user_load
        sst.widget_user_load = ''
        if os.path.exists(sst.submitted_user_load):
            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
            if "vectorstore" in sst:
                if new_db is not None:  # Check if this is working
                    # Merge the loaded vectorstore into the existing one.
                    sst.vectorstore.merge_from(new_db)
                    sst.conversation = get_conversation_chain(sst.vectorstore)
                    st.sidebar.success("faiss loaded")
            else:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore = new_db
                    sst.conversation = get_conversation_chain(new_db)
                    st.sidebar.success("faiss loaded")
        else:
            st.sidebar.warning("Couldn't load/find embeddings")

    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:
        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
        # sst.openai = st.toggle(label="use openai?")
        if sst.submitted_user_query:
            if "vectorstore" in sst:
                handle_userinput(sst.submitted_user_query)
            else:
                st.warning("No vectorstore loaded.")
        with st.sidebar:
            st.subheader("Your documents")
            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
            if st.button("Process"):
                with st.spinner("Processing"):
                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
                    sst.vectorstore = vec
                    sst.conversation = get_conversation_chain(vec)
                    st.success("embedding complete")
            st.text_input('Save embeddings to: (copy path of folder)', key='widget_user_safe',
                          on_change=submit_user_safe)
            st.text_input('Load embeddings from: (copy path of folder)', key='widget_user_load',
                          on_change=submit_user_load)
if __name__ == '__main__':
    sst = st.session_state
    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
    main()