File size: 9,096 Bytes
fcac63a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
"""
complete, functional RAG App
stores vectors in session state, or locally.
add function to display retrieved documents
"""

# import time
from datetime import datetime
# import openai
# import tiktoken
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from html_templates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
import os
import numpy as np
import faiss_utils
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings


def merge_faiss_indices(index1, index2):
    """
    Merge two FAISS indices into a new index, assuming both are of the same
    type and dimensionality.

    Args:
        index1 (faiss.Index): The first FAISS index.
        index2 (faiss.Index): The second FAISS index.

    Returns:
        faiss.Index: A new FAISS index containing all vectors from index1 and index2.

    Raises:
        ValueError: If the indices differ in type or dimensionality.
        TypeError: If the index type is not supported for merging here.
    """
    # BUG FIX: the original referenced IndexFlatL2 / rev_swig_ptr / METRIC_L2
    # on langchain's `FAISS` vectorstore wrapper, which exposes none of them.
    # They live on the native `faiss` module (already an implicit dependency
    # of langchain's FAISS vectorstore), so import it locally here.
    import faiss

    # Check if both indices are the same type
    if type(index1) != type(index2):
        raise ValueError("Indices are of different types")

    # Check dimensionality
    if index1.d != index2.d:
        raise ValueError("Indices have different dimensionality")

    d = index1.d

    if isinstance(index1, faiss.IndexFlatL2):
        # Handle simple flat indices. reconstruct_n returns an (ntotal, d)
        # float32 array -- unlike raw xb pointer tricks it needs no manual
        # reshaping before vstack/add.
        xb1 = index1.reconstruct_n(0, index1.ntotal)
        xb2 = index2.reconstruct_n(0, index2.ntotal)

        # Create a new index and add the combined vectors.
        new_index = faiss.IndexFlatL2(d)
        new_index.add(np.vstack((xb1, xb2)))
        return new_index

    elif isinstance(index1, faiss.IndexIVFFlat):
        # Handle quantized indices (IndexIVFFlat).
        nlist = index1.nlist
        quantizer = faiss.IndexFlatL2(d)  # Re-create a compatible quantizer

        # Create a new index with the same configuration.
        new_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

        # IVF indices need a direct map before reconstruct_n can be used.
        index1.make_direct_map()
        index2.make_direct_map()
        vecs1 = index1.reconstruct_n(0, index1.ntotal)
        vecs2 = index2.reconstruct_n(0, index2.ntotal)
        combined = np.vstack((vecs1, vecs2))

        # BUG FIX: an IVF index must be trained before vectors can be added;
        # the original called add() on an untrained index.
        if not new_index.is_trained:
            new_index.train(combined)
        new_index.add(combined)
        return new_index

    else:
        raise TypeError("Index type not supported for merging in this function")


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: Iterable of file-like objects (e.g. Streamlit uploads)
            readable by PyPDF2's PdfReader.

    Returns:
        str: All extracted page text joined together ("" for no documents).
    """
    parts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # BUG FIX: extract_text() returns None for pages with no
            # extractable text (e.g. scanned images); the original
            # `text += page.extract_text()` raised TypeError on those.
            parts.append(page.extract_text() or "")
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(parts)


def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Uses a newline-separated CharacterTextSplitter with a 1000-character
    chunk size and a 200-character overlap between consecutive chunks.

    Args:
        text (str): The full document text.

    Returns:
        list[str]: The chunked text segments.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)


def get_faiss_vectorstore(text_chunks):
    """Embed text chunks and build an in-memory FAISS vector store.

    Selects OpenAI embeddings when the session flag ``sst.openai`` is
    truthy, otherwise falls back to the HuggingFace instructor model.

    Args:
        text_chunks (list[str]): Text segments to embed.

    Returns:
        FAISS: A langchain FAISS vector store over the chunks.
    """
    embeddings = (
        OpenAIEmbeddings()
        if sst.openai
        else HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Uses ChatOpenAI when the session flag ``sst.openai`` is truthy,
    otherwise a HuggingFace Hub flan-t5-xxl model. Chat history is kept
    in a ConversationBufferMemory under the key 'chat_history'.

    Args:
        vectorstore: A langchain vector store exposing ``as_retriever()``.

    Returns:
        ConversationalRetrievalChain: The configured chain.
    """
    if sst.openai:
        model = ChatOpenAI()
    else:
        model = HuggingFaceHub(
            repo_id="google/flan-t5-xxl",
            model_kwargs={"temperature": 0.5, "max_length": 512},
        )

    chat_memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm=model,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
    return chain


def handle_userinput(user_question):
    """Run the user's question through the conversation chain and render chat.

    Side effects: updates ``sst.chat_history`` from the chain response and
    writes the alternating user/bot messages (HTML templates) to the page.

    Args:
        user_question (str): The question typed by the user.
    """
    response = sst.conversation({'question': user_question})
    sst.chat_history = response['chat_history']

    for i, message in enumerate(sst.chat_history):
        # Even positions hold the user's messages, odd positions the AI's.
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            # BUG FIX: removed leftover debug `print(message)` that dumped
            # every AI message to stdout.
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
            # Display source document information if available in the message
            if hasattr(message, 'source') and message.source:
                st.write(f"Source Document: {message.source}", unsafe_allow_html=True)


# Configuration and credentials pulled from the environment at import time.
# FIX: removed the pointless `if True:` wrapper -- the names were already
# module-global, so plain top-level statements behave identically.
# NOTE(review): os.environ[...] raises KeyError when a variable is missing;
# presumably intentional fail-fast behavior -- confirm before softening to .get().
BASE_URL = "https://api.vectara.io/v1"
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
OPENAI_ORG_ID = os.environ["OPENAI_ORG_ID"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY_LCBIM"]
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
VECTARA_API_KEY = os.environ["VECTARA_API_KEY"]
VECTARA_CUSTOMER_ID = os.environ["VECTARA_CUSTOMER_ID"]
headers = {"Authorization": f"Bearer {VECTARA_API_KEY}", "Content-Type": "application/json"}


def main():
    """Streamlit entry point: password-gated RAG chat over uploaded PDFs.

    Initializes every session-state key on first run, wires the query /
    save / load text inputs to on_change callbacks, and offers a sidebar
    uploader that embeds PDFs into a FAISS vector store kept in
    ``sst.vectorstore``.
    """
    st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
    st.write(css, unsafe_allow_html=True)
    # Seed session-state keys so later reads never hit a missing attribute.
    if "conversation" not in sst:
        sst.conversation = None
    if "chat_history" not in sst:
        sst.chat_history = None
    if "page" not in sst:
        sst.page = "home"
    if "openai" not in sst:
        sst.openai = True
    if "login" not in sst:
        sst.login = False
    if 'submitted_user_query' not in sst:
        sst.submitted_user_query = ''
    if 'submitted_user_safe' not in sst:
        sst.submitted_user_safe = ''
    if 'submitted_user_load' not in sst:
        sst.submitted_user_load = ''

    # on_change callback: copy the widget value into session state and
    # clear the widget so the input box is empty on the next rerun.
    def submit_user_query():
        sst.submitted_user_query = sst.widget_user_query
        sst.widget_user_query = ''

    # on_change callback: persist the current vector store to the path the
    # user typed, if one exists.
    def submit_user_safe():
        sst.submitted_user_safe = sst.widget_user_safe
        sst.widget_user_safe = ''
        if "vectorstore" in sst:
            # faiss_name = str(datetime.now().strftime("%Y%m%d%H%M%S")) + "faiss_index"
            faiss_utils.save_local(sst.vectorstore, path=sst.submitted_user_safe)
            st.sidebar.success("saved")
        else:
            st.sidebar.warning("No embeddings to save. Please process documents first.")

    # on_change callback: load a saved index from disk; merge it into an
    # existing vector store or adopt it as the new one.
    def submit_user_load():
        sst.submitted_user_load = sst.widget_user_load
        sst.widget_user_load = ''
        if os.path.exists(sst.submitted_user_load):
            new_db = faiss_utils.load_vectorstore(f"{sst.submitted_user_load}/faiss_index.index")
            if "vectorstore" in sst:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore.merge_from(new_db)
                    sst.conversation = get_conversation_chain(sst.vectorstore)
                    st.sidebar.success("faiss loaded")
            else:
                if new_db is not None:  # Check if this is working
                    sst.vectorstore = new_db
                    sst.conversation = get_conversation_chain(new_db)
                    st.sidebar.success("faiss loaded")
        else:
            st.sidebar.warning("Couldn't load/find embeddings")

    st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
    # Everything below is gated behind the shared password from the env.
    if st.text_input("ASK_ASH_PASSWORD: ", type="password") == ASK_ASH_PASSWORD:

        #user_question = st.text_input("Ask a question about your documents:", key="user_query", on_change=handle_query)
        st.text_input('Ask a question about your documents:', key='widget_user_query', on_change=submit_user_query)
        #sst.openai = st.toggle(label="use openai?")

        if sst.submitted_user_query:
            if "vectorstore" in sst:
                handle_userinput(sst.submitted_user_query)
            else:
                st.warning("no vectorstore loaded.")

        with st.sidebar:
            st.subheader("Your documents")
            pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
            if st.button("Process"):
                with st.spinner("Processing"):
                    # Extract text -> chunk -> embed into a fresh FAISS store.
                    vec = get_faiss_vectorstore(get_text_chunks(get_pdf_text(pdf_docs)))
                    sst.vectorstore = vec
                    sst.conversation = get_conversation_chain(vec)
                st.success("embedding complete")

            st.text_input('Safe Embeddings to: (copy path of folder)', key='widget_user_safe',
                          on_change=submit_user_safe)

            st.text_input('Load Embeddings from: (copy path of folder)', key='widget_user_load',
                          on_change=submit_user_load)


if __name__ == '__main__':
    # Module-level globals used throughout this file: `sst` aliases the
    # Streamlit session state, ASK_ASH_PASSWORD gates access in main().
    # NOTE(review): these are only defined under __main__, so importing this
    # module (rather than running it) leaves them undefined -- confirm intent.
    sst = st.session_state
    ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]
    main()