File size: 8,005 Bytes
c2ccc8e
971e413
 
 
ebf428d
 
 
971e413
 
c2ccc8e
971e413
1f54d58
971e413
22a9f10
c2ccc8e
971e413
 
 
 
 
 
 
 
22a9f10
c2ccc8e
 
 
 
 
 
 
 
 
22a9f10
c2ccc8e
 
 
 
 
 
 
 
 
22a9f10
c2ccc8e
 
 
 
 
 
 
 
 
22a9f10
c2ccc8e
 
971e413
 
 
c2ccc8e
22a9f10
971e413
 
 
22a9f10
c2ccc8e
e73a2ca
971e413
 
22a9f10
 
 
 
96574b8
 
 
 
 
3934d85
96574b8
3934d85
 
 
 
 
e5e6ae5
3934d85
 
 
 
 
 
68ad46b
22a9f10
 
971e413
 
 
 
e5e6ae5
971e413
4fe6caf
971e413
4fe6caf
22a9f10
c2ccc8e
22a9f10
f27fb7b
 
51488c8
22a9f10
971e413
22a9f10
 
51488c8
971e413
c2ccc8e
22a9f10
 
 
202a39d
 
22a9f10
202a39d
 
22a9f10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2ccc8e
 
bd8f0ee
971e413
 
c2ccc8e
e0ef6cf
 
 
 
22a9f10
bf9e95f
 
 
 
 
22a9f10
c2ccc8e
 
bf9e95f
 
 
 
c2ccc8e
22a9f10
971e413
 
 
bf9e95f
 
 
 
 
 
 
 
 
 
 
 
22a9f10
bf9e95f
22a9f10
 
bf9e95f
22a9f10
 
bf9e95f
 
 
 
971e413
22a9f10
c2ccc8e
bd8f0ee
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from huggingface_hub import InferenceClient
import tempfile
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
from htmlTemplates import css, bot_template, user_template


def get_pdf_text(pdf_docs):
    """Load one uploaded PDF file into LangChain documents.

    The upload's bytes are written to a temporary file because
    ``PyPDFLoader`` is constructed from a filesystem path.

    Args:
        pdf_docs: A single uploaded file object exposing ``name`` and
            ``getvalue()`` (``main`` passes items returned by
            ``st.file_uploader``).

    Returns:
        The result of ``PyPDFLoader(...).load()`` for the file.
    """
    # The context manager removes the directory as soon as loading is done,
    # instead of leaving cleanup to garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() prevents a client-supplied name with path separators
        # from writing outside the temporary directory.
        temp_filepath = os.path.join(temp_dir, os.path.basename(pdf_docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(pdf_docs.getvalue())
        # load() must run while the temporary file still exists.
        return PyPDFLoader(temp_filepath).load()


def get_text_file(text_docs):
    """Load one uploaded plain-text file into LangChain documents.

    The upload's bytes are written to a temporary file because
    ``TextLoader`` is constructed from a filesystem path.

    Args:
        text_docs: A single uploaded file object exposing ``name`` and
            ``getvalue()`` (``main`` passes items returned by
            ``st.file_uploader``).

    Returns:
        The result of ``TextLoader(...).load()`` for the file.
    """
    # The context manager removes the directory as soon as loading is done,
    # instead of leaving cleanup to garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() prevents a client-supplied name with path separators
        # from writing outside the temporary directory.
        temp_filepath = os.path.join(temp_dir, os.path.basename(text_docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(text_docs.getvalue())
        # load() must run while the temporary file still exists.
        return TextLoader(temp_filepath).load()


def get_csv_file(csv_docs):
    """Load one uploaded CSV file into LangChain documents.

    The upload's bytes are written to a temporary file because
    ``CSVLoader`` is constructed from a filesystem path.

    Args:
        csv_docs: A single uploaded file object exposing ``name`` and
            ``getvalue()`` (``main`` passes items returned by
            ``st.file_uploader``).

    Returns:
        The result of ``CSVLoader(...).load()`` for the file.
    """
    # The context manager removes the directory as soon as loading is done,
    # instead of leaving cleanup to garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() prevents a client-supplied name with path separators
        # from writing outside the temporary directory.
        temp_filepath = os.path.join(temp_dir, os.path.basename(csv_docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(csv_docs.getvalue())
        # load() must run while the temporary file still exists.
        return CSVLoader(temp_filepath).load()


def get_json_file(json_docs):
    """Load one uploaded JSON file into LangChain documents.

    The upload's bytes are written to a temporary file because
    ``JSONLoader`` is constructed from a filesystem path.

    Args:
        json_docs: A single uploaded file object exposing ``name`` and
            ``getvalue()`` (``main`` passes items returned by
            ``st.file_uploader``).

    Returns:
        The result of ``JSONLoader(...).load()`` for the file.
    """
    # The context manager removes the directory as soon as loading is done,
    # instead of leaving cleanup to garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() prevents a client-supplied name with path separators
        # from writing outside the temporary directory.
        temp_filepath = os.path.join(temp_dir, os.path.basename(json_docs.name))
        with open(temp_filepath, "wb") as f:
            f.write(json_docs.getvalue())
        # JSONLoader requires a jq_schema; "." selects the whole document.
        # text_content=False lets non-string JSON values (objects, arrays)
        # be serialised instead of rejected.
        json_loader = JSONLoader(temp_filepath, jq_schema=".", text_content=False)
        # load() must run while the temporary file still exists.
        return json_loader.load()


def get_text_chunks(documents):
    """Split documents into overlapping chunks for embedding.

    Chunks are at most 300 units long with a 100-unit overlap, where length
    is measured with the built-in ``len``.

    Args:
        documents: Documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunked documents produced by the splitter.
    """
    splitter_options = {
        "chunk_size": 300,
        "chunk_overlap": 100,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)


def get_vectorstore(text_chunks):
    """Embed the chunks and index them in a FAISS vector store.

    Args:
        text_chunks: Documents to index, passed to ``FAISS.from_documents``.

    Returns:
        The FAISS vector store built from ``text_chunks`` using the
        ``WhereIsAI/UAE-Large-V1`` embedding model.
    """
    embedding_model = HuggingFaceEmbeddings(model_name="WhereIsAI/UAE-Large-V1")
    return FAISS.from_documents(text_chunks, embedding_model)
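

# Other Hugging Face model IDs kept for reference (presumably alternatives
# for the embedding and chat models -- confirm before switching):
#   sentence-transformers/all-MiniLM-L6-v2
#   HuggingFaceH4/zephyr-7b-alpha
#   Qwen/Qwen2.5-72B-Instruct
#   mistralai/Mistral-7B-Instruct-v0.2

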
def get_conversation_chain(vectorstore, tokenH):
    """Build a callable that answers questions from retrieved document chunks.

    Args:
        vectorstore: Vector store exposing ``as_retriever``; it is queried
            for the 5 most similar chunks per question.
        tokenH: Hugging Face API token handed to ``InferenceClient``.

    Returns:
        A function ``conversation_chain(user_input)`` that retrieves context
        for ``user_input``, sends it to ``Qwen/Qwen2.5-72B-Instruct`` and
        returns the reply text. That function raises ``ValueError`` when the
        completion request fails.

    Raises:
        ValueError: If ``tokenH`` is empty or the client cannot be created.
    """
    if not tokenH:
        raise ValueError("API token is required to initialize the HuggingFaceHub model")

    try:
        client = InferenceClient(api_key=tokenH)
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Error initializing HuggingFace InferenceClient: {str(e)}") from e

    # The retriever configuration never changes, so build it once rather
    # than on every question.
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    def generate_response(messages):
        """Send ``messages`` to the chat model and return the reply text."""
        try:
            completion = client.chat.completions.create(
                model="Qwen/Qwen2.5-72B-Instruct",
                messages=messages,
                max_tokens=500,
            )
            return completion.choices[0].message['content']
        except Exception as e:
            raise ValueError(f"Error generating response: {str(e)}") from e

    def conversation_chain(user_input):
        """Answer ``user_input`` using the retrieved chunks as context."""
        documents = retriever.get_relevant_documents(user_input)
        documents_text = "\n".join(doc.page_content for doc in documents)
        # The system (context) message goes first so the user's question is
        # the final turn the model replies to.
        messages = [
            {"role": "system", "content": documents_text},
            {"role": "user", "content": user_input},
        ]
        return generate_response(messages)

    return conversation_chain


def handle_userinput(user_question):
    """Answer a question and render the whole chat history.

    Calls the callable stored in ``st.session_state.conversation`` with the
    question, appends the question and the reply to
    ``st.session_state.chat_history`` and writes every stored message using
    ``user_template`` / ``bot_template``.

    Args:
        user_question: The question text entered by the user.
    """
    # Function-local import keeps this block self-contained.
    import html

    # Ensure chat_history is initialized
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Get the response from the conversation
    response = st.session_state.conversation(user_question)

    # Append the user's question and the assistant's response to chat history
    st.session_state.chat_history.append({"role": "user", "content": user_question})
    st.session_state.chat_history.append({"role": "assistant", "content": response})

    # Display the chat history
    for message in st.session_state.chat_history:
        template = user_template if message["role"] == "user" else bot_template
        # The templates are rendered with unsafe_allow_html=True, so user and
        # model text must be escaped to stop it being interpreted as markup.
        safe_content = html.escape(str(message["content"] or ""))
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)


def main():
    """Run the Streamlit page: collect a token, answer questions, index files.

    The page stops early with a warning until a token is entered. Questions
    are answered only once ``st.session_state.conversation`` has been set by
    a successful "Process" run over the uploaded files.
    """
    st.set_page_config(page_title="Chat with multiple Files", page_icon=":books:")
    st.header("Chat with Multiple Files")
    # Mask the token on screen; Hugging Face tokens start with "hf_".
    tokenH = st.text_input("Paste your HuggingFace API Token (hf_...)", type="password")

    if not tokenH:
        st.warning("Please enter a valid HuggingFace API token.")
        return

    # Initialize session state variables
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # User input for questions
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        if st.session_state.conversation:
            handle_userinput(user_question)
        else:
            st.warning("Please upload and process files first!")

    # File uploader and processing
    docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
    if st.button("Process"):
        with st.spinner("Processing"):
            if not docs:
                st.warning("Please upload at least one document to process.")
                return

            # MIME type -> loader function for that kind of upload.
            loaders = {
                'text/plain': get_text_file,
                'application/octet-stream': get_pdf_text,
                'application/pdf': get_pdf_text,
                'text/csv': get_csv_file,
                'application/json': get_json_file,
            }

            doc_list = []
            skipped = []
            for file in docs:
                loader = loaders.get(file.type)
                if loader is None:
                    # Report unsupported uploads instead of silently dropping them.
                    skipped.append(file.name)
                    continue
                doc_list.extend(loader(file))

            if skipped:
                st.warning(f"Skipped unsupported file(s): {', '.join(skipped)}")

            # Building a vector store from zero documents cannot succeed, so
            # stop here with a clear message.
            if not doc_list:
                st.warning("No readable content was found in the uploaded files.")
                return

            # Generate text chunks
            text_chunks = get_text_chunks(doc_list)

            # Create vector store
            vectorstore = get_vectorstore(text_chunks)

            # Initialize conversation chain
            st.session_state.conversation = get_conversation_chain(vectorstore, tokenH)
            st.success("Documents processed successfully!")


# Script entry point: start the app only when this file is run directly.
if __name__ == "__main__":
    main()