RAHMAN00700 committed on
Commit
3e1f2bb
·
unverified ·
1 Parent(s): 5bfe977

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -41
app.py CHANGED
@@ -2,23 +2,33 @@ import streamlit as st
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.vectorstores import FAISS
7
  from langchain.chat_models import ChatOpenAI
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.chains import ConversationalRetrievalChain
 
10
  from langchain.llms import HuggingFaceHub
11
- from ibm_watson_machine_learning.foundation_models import Model
12
- from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
13
- from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
14
- import requests
15
  import os
 
16
  import tempfile
 
 
 
 
 
17
 
18
- # Load environment variables
19
- load_dotenv()
 
 
 
 
 
 
20
 
21
- # IBM Watson credentials
22
  project_id = os.getenv("PROJECT_ID", None)
23
  credentials = {
24
  "url": "https://us-south.ml.cloud.ibm.com",
@@ -35,21 +45,14 @@ def getBearer(apikey):
35
 
36
  credentials["token"] = getBearer(credentials["apikey"])
37
 
38
# Use a supported model. The id is passed as a plain string so this setup does
# not depend on the installed SDK's ModelTypes enum having a member for it.
model_id = "meta-llama/llama-3-70b-instruct"

# Text-generation settings handed to the foundation model below.
parameters = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 500,
    GenParams.MIN_NEW_TOKENS: 0,
    GenParams.STOP_SEQUENCES: ["\n"],
    GenParams.REPETITION_PENALTY: 2,
}

# Initialize Watsonx foundation model
llama_model = Model(
    model_id=model_id,
    params=parameters,
    credentials=credentials,
    project_id=project_id,
)
@@ -70,50 +73,151 @@ def get_text_chunks(text):
70
  chunk_overlap=200,
71
  length_function=len
72
  )
73
- return text_splitter.split_text(text)
 
74
 
75
# Build a FAISS index over the text chunks
def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` and return a FAISS vector store built from them.

    The chunks are embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` Hugging Face model.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    store = FAISS.from_texts(text_chunks, embedder)
    return store
 
 
79
 
80
# Wire the LLM, the retriever and chat memory into one chain
def get_conversation_chain(vectorstore):
    """Return a ConversationalRetrievalChain backed by ``vectorstore``.

    The chain uses the module-level ``llama_model`` (converted to a LangChain
    LLM) and keeps the dialogue in a buffer memory under ``chat_history``.
    """
    chat_llm = llama_model.to_langchain()
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    doc_retriever = vectorstore.as_retriever()
    chain = ConversationalRetrievalChain.from_llm(
        chat_llm, doc_retriever, memory=chat_memory
    )
    return chain
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
# Main function
def main():
    """Render the Streamlit page: a question box plus a PDF upload sidebar.

    A question submitted with "Search" is sent straight to ``llama_model``
    and the response is written to the page. PDFs uploaded in the sidebar
    are chunked, embedded, and turned into a conversation chain stored in
    ``st.session_state.conversation``.
    """
    st.set_page_config(page_title="Chat with your Documents", page_icon=":books:")
    st.header("Chat with Research Papers :books:")

    # Session slots survive Streamlit reruns; create them on first load.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None

    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    user_question = st.text_input("Ask questions to research paper or upload your documents:")

    if st.button("Search") and user_question:
        with st.spinner("Processing your question..."):
            # generate() takes the prompt text itself, not a {"question": ...}
            # mapping (that shape is what the retrieval chain expects).
            response = llama_model.generate(user_question)
            st.write(response)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)

        if pdf_docs:
            if st.button("Process"):
                with st.spinner("Processing your documents..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)
                    st.write("Documents loaded")
 
 
117
 
118
- if __name__ == "__main__":
119
  main()
 
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
  from langchain.vectorstores import FAISS
7
  from langchain.chat_models import ChatOpenAI
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.chains import ConversationalRetrievalChain
10
+ from htmlTemplates import css, bot_template, user_template
11
  from langchain.llms import HuggingFaceHub
12
+ from langchain.embeddings import HuggingFaceEmbeddings
 
 
 
13
  import os
14
+ import requests
15
  import tempfile
16
+ import pandas as pd
17
+ from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
18
+ from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
19
+ from langchain.vectorstores import FAISS
20
+ from langchain.embeddings import TensorflowHubEmbeddings
21
 
22
# Generation settings for the shared Watsonx model: greedy decoding, between
# 0 and 500 new tokens, stop at the first newline, repetition penalty of 2.
parameters = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 500,
    GenParams.MIN_NEW_TOKENS: 0,
    GenParams.STOP_SEQUENCES: ["\n"],
    GenParams.REPETITION_PENALTY: 2,
}
30
 
31
+ load_dotenv()
32
  project_id = os.getenv("PROJECT_ID", None)
33
  credentials = {
34
  "url": "https://us-south.ml.cloud.ibm.com",
 
45
 
46
  credentials["token"] = getBearer(credentials["apikey"])
47
 
48
# Model selection: the id is given as a literal string taken from the list of
# supported models.
from ibm_watson_machine_learning.foundation_models import Model

model_id = "meta-llama/llama-3-70b-instruct"

# Module-level Watsonx foundation model used by the conversation chain.
llama_model = Model(
    model_id=model_id,
    params=parameters,
    credentials=credentials,
    project_id=project_id,
)
 
73
  chunk_overlap=200,
74
  length_function=len
75
  )
76
+ chunks = text_splitter.split_text(text)
77
+ return chunks
78
 
79
# Build a FAISS index over the text chunks
def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` and return a FAISS vector store built from them.

    Embeddings come from the multilingual Universal Sentence Encoder model
    loaded from TensorFlow Hub.
    """
    encoder = TensorflowHubEmbeddings(
        model_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    )
    return FAISS.from_texts(texts=text_chunks, embedding=encoder)
85
 
86
# Wire the LLM, the retriever and chat memory into one chain
def get_conversation_chain(vectorstore):
    """Return a ConversationalRetrievalChain backed by ``vectorstore``.

    The chain uses the module-level ``llama_model`` (converted to a LangChain
    LLM) and keeps the dialogue in a buffer memory under ``chat_history``.
    """
    chat_llm = llama_model.to_langchain()
    chat_memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
97
+
98
def _parse_keyword_list(generated_text):
    """Turn the model's free-text reply into a clean list of keyword strings."""
    import ast
    import re

    text = generated_text.strip()

    # The prompt asks for a Python list, so first try to read the first
    # [...] span in the reply as a literal.
    bracketed = re.search(r"\[.*?\]", text, flags=re.DOTALL)
    if bracketed:
        try:
            parsed = ast.literal_eval(bracketed.group(0))
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            keywords = [str(item).strip() for item in parsed]
            return [keyword for keyword in keywords if keyword]

    # Fallback for replies that are not a valid literal: treat the text as
    # comma-separated and strip stray brackets, quotes and whitespace.
    pieces = (piece.strip(" \t\r\n[]'\"") for piece in text.split(","))
    return [piece for piece in pieces if piece]


def call_model_flan(question):
    """Ask the LLM for search keywords describing ``question``.

    A Watsonx model is created for this call with short-output generation
    settings, prompted to return three keywords as a Python list, and its
    reply is parsed into a list of plain keyword strings.

    Args:
        question: The user's natural-language question.

    Returns:
        A list of keyword strings without surrounding brackets or quotes.
        The list is empty if the reply contains no usable keyword.
    """
    parameters = {
        GenParams.DECODING_METHOD: "greedy",
        GenParams.MAX_NEW_TOKENS: 50,
        GenParams.MIN_NEW_TOKENS: 1,
        GenParams.STOP_SEQUENCES: ["<|endoftext|>"],
        GenParams.REPETITION_PENALTY: 1,
    }

    # Initialize the Watsonx foundation model
    llm_model = Model(
        model_id="meta-llama/llama-3-405b-instruct",
        params=parameters,
        credentials=credentials,
        project_id=project_id,
    )

    prompt = f"Considering the following question, generate 3 keywords most significant to use when searching in the Arxiv API. Provide your response as a Python list: {question}."
    result = llm_model.generate(prompt)['results'][0]['generated_text']

    # A bare split(', ') would leave "['", "'" and "']" glued to the keywords.
    return _parse_keyword_list(result)
121
+
122
def download_pdf(url, filename):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Path of the local file to create or overwrite.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the app indefinitely.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(response.content)
126
+
127
def download_pdf_files(url_list):
    """Download every URL into the system temp directory.

    Files are saved as ``file_1.pdf``, ``file_2.pdf``, ... in the order the
    URLs are given.

    Returns:
        The list of absolute paths that were written, in download order.
    """
    target_dir = tempfile.gettempdir()
    saved_paths = []
    for position, pdf_url in enumerate(url_list, start=1):
        destination = os.path.join(target_dir, f'file_{position}.pdf')
        download_pdf(pdf_url, destination)
        saved_paths.append(destination)
        print(f'Downloaded: {destination}')
    return saved_paths
136
+
137
def delete_files_in_temp():
    """Remove the PDFs this app downloaded into the system temp directory.

    Only regular files named ``file_<n>.pdf`` (the naming pattern used when
    downloading) are deleted. The temp directory is shared with other
    programs, so everything else in it is left untouched. Failures to delete
    a file are reported on stdout and do not stop the cleanup.
    """
    import re

    temp_dir = tempfile.gettempdir()  # Get the temporary directory path
    own_download = re.compile(r"file_\d+\.pdf")
    for entry in os.listdir(temp_dir):
        if not own_download.fullmatch(entry):
            continue
        file_path = os.path.join(temp_dir, entry)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
                print(f"Deleted: {file_path}")
        except OSError as e:
            print(f"Failed to delete {file_path}: {e}")
147
+
148
def arxiv_search(topic):
    """Search arXiv for ``topic`` and download the top matching papers.

    Args:
        topic: Query string passed to the arXiv search.

    Returns:
        A ``(downloaded_files, titles)`` tuple: the local paths of the
        downloaded PDFs and the titles of the matching papers, both in
        result order (at most five, sorted by relevance).
    """
    import arxiv

    search = arxiv.Search(
        query=topic,
        max_results=5,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    # Run the query once and reuse the results, so titles and URLs are
    # guaranteed to line up and the API is not hit twice.
    results = list(arxiv.Client().results(search))
    titles = [result.title for result in results]
    url_list = [result.pdf_url for result in results]
    downloaded_files = download_pdf_files(url_list)
    return downloaded_files, titles
162
+
163
# Function to handle user input and display responses
def handle_user_input(user_question, titles=None):
    """Send the question to the conversation chain and render the chat.

    The chain stored in ``st.session_state.conversation`` is called with the
    question, the returned history is saved in
    ``st.session_state.chat_history``, and every message is written to the
    page, alternating between the user and bot templates.

    Args:
        user_question: Text typed by the user.
        titles: Accepted for compatibility with callers; not used here.
    """
    import html

    prompt = {"question": user_question}
    response = st.session_state.conversation(prompt)
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        # Even positions are rendered as user turns, odd ones as bot turns.
        template = user_template if i % 2 == 0 else bot_template
        # The text comes from the user and the model and is rendered as raw
        # HTML, so escape it to keep markup in a message from being executed.
        safe_text = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_text), unsafe_allow_html=True)
171
 
172
# Main function
def main():
    """Render the Streamlit page and drive the search / upload / chat flow.

    "Search" turns the question into keywords, fetches matching arXiv papers,
    indexes them and stores a conversation chain in the session. The sidebar
    does the same for PDFs uploaded by the user. Whenever a question and a
    conversation chain are both present, the question is answered and the
    chat history is rendered.
    """
    import html

    st.set_page_config(page_title="Chat with your Documents", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Session slots survive Streamlit reruns; create them on first load.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with Research papers :books:")
    user_question = st.text_input("Ask questions to research paper or upload your documents:")

    if st.button("Search") and user_question:
        with st.spinner("Analyzing query"):
            original_list = call_model_flan(user_question)
            # Drop duplicates but keep the model's keyword order; a set would
            # reorder the query differently from run to run.
            unique_list = list(dict.fromkeys(original_list))
            topic = ' '.join(unique_list)  # full topic creation
        with st.spinner("Searching in Database: " + topic):
            downloaded_files, titles = arxiv_search(topic)
        with st.spinner("Vectorizing results"):
            # Get PDF text and split into chunks
            raw_text = get_pdf_text(downloaded_files)
            text_chunks = get_text_chunks(raw_text)
            # Create vector store and conversation chain
            vectorstore = get_vectorstore(text_chunks)
            st.write("Documents loaded")
            st.session_state.conversation = get_conversation_chain(vectorstore)
        if titles is not None:
            # Titles come from an external service and are rendered as raw
            # HTML, so escape them before inserting them into the template.
            enumerated_strings = [
                f"{position}. {html.escape(title)}"
                for position, title in enumerate(titles, start=1)
            ]
            combined_string = ', <br> '.join(enumerated_strings)
            st.write(bot_template.replace("{{MSG}}", "Relevant papers found: " + combined_string), unsafe_allow_html=True)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if not pdf_docs:
            st.write('You can add your document')
        else:
            if st.button("Process"):
                with st.spinner("Processing"):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.write("Document loaded")
                    st.session_state.conversation = get_conversation_chain(vectorstore)

    if user_question and st.session_state.conversation is not None:
        handle_user_input(user_question)
221
 
222
+ if __name__ == '__main__':
223
  main()