patel18 committed on
Commit
a6fe9a8
·
verified ·
1 Parent(s): ca54725

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -44
app.py CHANGED
@@ -1,20 +1,124 @@
1
- import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
 
3
  import streamlit as st
4
  from dotenv import load_dotenv
5
  from PyPDF2 import PdfReader
 
6
  from langchain.text_splitter import CharacterTextSplitter
7
- from langchain_openai import OpenAIEmbeddings
8
  from langchain.vectorstores import FAISS
9
- # from langchain_community.vectorstores import FAISS
10
- from langchain.embeddings import HuggingFaceEmbeddings
11
  from langchain.memory import ConversationBufferMemory
12
  from langchain.chains import ConversationalRetrievalChain
13
- from langchain.chat_models import ChatOpenAI
14
- from htmlTemplates import css, bot_template, user_template
15
- from langchain.embeddings import HuggingFaceInstructEmbeddings
16
  from langchain.llms import HuggingFaceHub
17
- import os
 
 
 
18
  def get_pdf_text(pdf_doc):
19
  text = ""
20
  for pdf in pdf_doc:
@@ -23,54 +127,69 @@ def get_pdf_text(pdf_doc):
23
  text += page.extract_text()
24
  return text
25
 
 
 
 
 
 
 
26
 
 
27
  def get_text_chunk(row_text):
28
  text_splitter = CharacterTextSplitter(
29
  separator="\n",
30
- chunk_size = 1000,
31
- chunk_overlap = 200,
32
- length_function = len
33
  )
34
  chunk = text_splitter.split_text(row_text)
35
  return chunk
36
 
37
-
38
  def get_vectorstore(text_chunk):
39
- #embeddings = OpenAIEmbeddings(openai_api_key = os.getenv("OPENAI_API_KEY"))
40
- embeddings = HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
41
- vector = FAISS.from_texts(text_chunk,embeddings)
42
  return vector
43
 
44
-
45
  def get_conversation_chain(vectorstores):
46
- #llm = ChatOpenAI(openai_api_key = os.getenv("OPENAI_API_KEY"))
47
- llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature":0.5, "max_length":512})
48
- memory = ConversationBufferMemory(memory_key = "chat_history",return_messages = True)
49
- conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm,
50
- retriever=vectorstores.as_retriever(),
51
- memory=memory)
 
52
  return conversation_chain
53
 
54
-
55
  def user_input(user_question):
56
- response = st.session_state.conversation({"question":user_question})
57
  st.session_state.chat_history = response["chat_history"]
58
 
59
  for indx, msg in enumerate(st.session_state.chat_history):
60
- if indx % 2==0:
61
- st.write(user_template.replace("{{MSG}}",msg.content), unsafe_allow_html=True)
62
  else:
63
  st.write(bot_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
64
 
 
 
 
 
 
 
65
 
66
-
67
  def main():
68
- # load secret key
69
  load_dotenv()
70
-
71
- # config the pg
72
- st.set_page_config(page_title="Chat with multiple PDFs" ,page_icon=":books:")
73
  st.write(css, unsafe_allow_html=True)
 
74
  if "conversation" not in st.session_state:
75
  st.session_state.conversation = None
76
 
@@ -79,27 +198,31 @@ def main():
79
  if user_question:
80
  user_input(user_question)
81
 
82
- # st.write(user_template.replace("{{MSG}}","Hello Robot"), unsafe_allow_html=True)
83
- # st.write(bot_template.replace("{{MSG}}","Hello Human"), unsafe_allow_html=True)
84
-
85
- # create side bar
86
  with st.sidebar:
87
  st.subheader("Your Documents")
88
- pdf_doc = st.file_uploader(label="Upload your documents",accept_multiple_files=True)
89
  if st.button("Process"):
90
  with st.spinner(text="Processing"):
91
-
92
- # get pdf text
93
  row_text = get_pdf_text(pdf_doc)
94
- # get the text chunk
95
  text_chunk = get_text_chunk(row_text)
96
- # st.write(text_chunk)
97
- # create vecor store
98
  vectorstores = get_vectorstore(text_chunk)
99
- # st.write(vectorstores)
100
- # create conversation chain
101
  st.session_state.conversation = get_conversation_chain(vectorstores)
102
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  if __name__ == "__main__":
105
- main()
 
1
+ # import os
2
+
3
+ # import streamlit as st
4
+ # from dotenv import load_dotenv
5
+ # from PyPDF2 import PdfReader
6
+ # from langchain.text_splitter import CharacterTextSplitter
7
+ # from langchain_openai import OpenAIEmbeddings
8
+ # from langchain.vectorstores import FAISS
9
+ # # from langchain_community.vectorstores import FAISS
10
+ # from langchain.embeddings import HuggingFaceEmbeddings
11
+ # from langchain.memory import ConversationBufferMemory
12
+ # from langchain.chains import ConversationalRetrievalChain
13
+ # from langchain.chat_models import ChatOpenAI
14
+ # from htmlTemplates import css, bot_template, user_template
15
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
16
+ # from langchain.llms import HuggingFaceHub
17
+ # import os
18
+ # def get_pdf_text(pdf_doc):
19
+ # text = ""
20
+ # for pdf in pdf_doc:
21
+ # pdf_reader = PdfReader(pdf)
22
+ # for page in pdf_reader.pages:
23
+ # text += page.extract_text()
24
+ # return text
25
+
26
+
27
+ # def get_text_chunk(row_text):
28
+ # text_splitter = CharacterTextSplitter(
29
+ # separator="\n",
30
+ # chunk_size = 1000,
31
+ # chunk_overlap = 200,
32
+ # length_function = len
33
+ # )
34
+ # chunk = text_splitter.split_text(row_text)
35
+ # return chunk
36
+
37
+
38
+ # def get_vectorstore(text_chunk):
39
+ # #embeddings = OpenAIEmbeddings(openai_api_key = os.getenv("OPENAI_API_KEY"))
40
+ # embeddings = HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
41
+ # vector = FAISS.from_texts(text_chunk,embeddings)
42
+ # return vector
43
+
44
+
45
+ # def get_conversation_chain(vectorstores):
46
+ # #llm = ChatOpenAI(openai_api_key = os.getenv("OPENAI_API_KEY"))
47
+ # llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature":0.5, "max_length":512})
48
+ # memory = ConversationBufferMemory(memory_key = "chat_history",return_messages = True)
49
+ # conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm,
50
+ # retriever=vectorstores.as_retriever(),
51
+ # memory=memory)
52
+ # return conversation_chain
53
+
54
+
55
+ # def user_input(user_question):
56
+ # response = st.session_state.conversation({"question":user_question})
57
+ # st.session_state.chat_history = response["chat_history"]
58
+
59
+ # for indx, msg in enumerate(st.session_state.chat_history):
60
+ # if indx % 2==0:
61
+ # st.write(user_template.replace("{{MSG}}",msg.content), unsafe_allow_html=True)
62
+ # else:
63
+ # st.write(bot_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
64
+
65
+
66
+
67
+ # def main():
68
+ # # load secret key
69
+ # load_dotenv()
70
+
71
+ # # config the pg
72
+ # st.set_page_config(page_title="Chat with multiple PDFs" ,page_icon=":books:")
73
+ # st.write(css, unsafe_allow_html=True)
74
+ # if "conversation" not in st.session_state:
75
+ # st.session_state.conversation = None
76
+
77
+ # st.header("Chat with multiple PDFs :books:")
78
+ # user_question = st.text_input("Ask a question about your docs")
79
+ # if user_question:
80
+ # user_input(user_question)
81
+
82
+ # # st.write(user_template.replace("{{MSG}}","Hello Robot"), unsafe_allow_html=True)
83
+ # # st.write(bot_template.replace("{{MSG}}","Hello Human"), unsafe_allow_html=True)
84
+
85
+ # # create side bar
86
+ # with st.sidebar:
87
+ # st.subheader("Your Documents")
88
+ # pdf_doc = st.file_uploader(label="Upload your documents",accept_multiple_files=True)
89
+ # if st.button("Process"):
90
+ # with st.spinner(text="Processing"):
91
+
92
+ # # get pdf text
93
+ # row_text = get_pdf_text(pdf_doc)
94
+ # # get the text chunk
95
+ # text_chunk = get_text_chunk(row_text)
96
+ # # st.write(text_chunk)
97
+ # # create vecor store
98
+ # vectorstores = get_vectorstore(text_chunk)
99
+ # # st.write(vectorstores)
100
+ # # create conversation chain
101
+ # st.session_state.conversation = get_conversation_chain(vectorstores)
102
+
103
+
104
+ # if __name__ == "__main__":
105
+ # main()
106
 
107
+ import os
108
  import streamlit as st
109
  from dotenv import load_dotenv
110
  from PyPDF2 import PdfReader
111
+ from pdf2image import convert_from_path
112
  from langchain.text_splitter import CharacterTextSplitter
113
+ from sentence_transformers import SentenceTransformer
114
  from langchain.vectorstores import FAISS
 
 
115
  from langchain.memory import ConversationBufferMemory
116
  from langchain.chains import ConversationalRetrievalChain
 
 
 
117
  from langchain.llms import HuggingFaceHub
118
+ from htmlTemplates import css, bot_template, user_template
119
+ from transformers import pipeline
120
+
121
+ # Function to extract text from PDF
122
  def get_pdf_text(pdf_doc):
123
  text = ""
124
  for pdf in pdf_doc:
 
127
  text += page.extract_text()
128
  return text
129
 
130
# Function to extract images from PDF
def get_pdf_images(pdf_doc):
    """Render every page of each uploaded PDF to a PIL image.

    Parameters
    ----------
    pdf_doc : list
        PDFs as returned by ``st.file_uploader`` (file-like UploadedFile
        objects) or, for backward compatibility, filesystem path strings.

    Returns
    -------
    list
        Flat list of PIL images, one per page across all PDFs.
    """
    # Local import: sibling of the module-level convert_from_path import.
    from pdf2image import convert_from_bytes

    images = []
    for pdf in pdf_doc:
        if isinstance(pdf, str):
            # Filesystem path — the original code path still works here.
            images.extend(convert_from_path(pdf))
        else:
            # Streamlit UploadedFile is file-like, not a path:
            # convert_from_path cannot open it, so hand pdf2image the
            # raw bytes instead.
            images.extend(convert_from_bytes(pdf.getvalue()))
    return images
136
 
137
# Function to split text into chunks
def get_text_chunk(row_text):
    """Split raw document text into overlapping chunks for embedding."""
    splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
    )
    return splitter.split_text(row_text)
147
 
148
# Function to create vector store
def get_vectorstore(text_chunk):
    """Embed the text chunks and index them in a FAISS vector store.

    ``FAISS.from_embeddings`` expects ``(text, vector)`` pairs plus an
    Embeddings-like object it can call at query time; the previous call
    passed only a bare embedding matrix, which raises a TypeError and
    discards the chunk texts.

    Parameters
    ----------
    text_chunk : list[str]
        Text chunks produced by ``get_text_chunk``.

    Returns
    -------
    FAISS
        Vector store ready to be used as a retriever.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    class _SentenceTransformerEmbeddings:
        """Minimal LangChain-style Embeddings adapter around *model*."""

        def embed_documents(self, texts):
            return model.encode(texts).tolist()

        def embed_query(self, text):
            return model.encode([text])[0].tolist()

    vectors = model.encode(text_chunk)
    text_embedding_pairs = list(zip(text_chunk, (v.tolist() for v in vectors)))
    vector = FAISS.from_embeddings(text_embedding_pairs, _SentenceTransformerEmbeddings())
    return vector
154
 
155
# Function to create conversation chain
def get_conversation_chain(vectorstores):
    """Build a ConversationalRetrievalChain over the given vector store.

    Uses a hosted flan-t5-base model as the LLM and a buffer memory so the
    chain can answer follow-up questions using prior chat history.
    """
    chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-base",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=vectorstores.as_retriever(),
        memory=chat_memory,
    )
165
 
166
# Function to handle user input
def user_input(user_question):
    """Run the question through the conversation chain and render the chat.

    Messages alternate user/assistant, so even indices are rendered with
    the user template and odd indices with the bot template.
    """
    response = st.session_state.conversation({"question": user_question})
    history = response["chat_history"]
    st.session_state.chat_history = history

    for indx, msg in enumerate(history):
        template = user_template if indx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
176
 
177
# Function to generate images from text using a DALL-E model
def generate_image_from_text(prompt):
    """Generate image(s) for *prompt* via a text-to-image pipeline.

    NOTE(review): "text-to-image" is not a task name supported by
    ``transformers.pipeline``, and "dalle-mini/dalle-mini" is not a
    transformers checkpoint — as written this call fails at runtime.
    Actual generation needs a diffusion library (e.g. diffusers); confirm
    before shipping.
    """
    # Cache the (very expensive to build) pipeline on the function object
    # so the model is loaded once per process, not once per question.
    generator = getattr(generate_image_from_text, "_generator", None)
    if generator is None:
        generator = pipeline("text-to-image", model="dalle-mini/dalle-mini")
        generate_image_from_text._generator = generator
    return generator(prompt)
183
 
184
+ # Main function
185
  def main():
186
+ # Load secret key
187
  load_dotenv()
188
+
189
+ # Config the page
190
+ st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
191
  st.write(css, unsafe_allow_html=True)
192
+
193
  if "conversation" not in st.session_state:
194
  st.session_state.conversation = None
195
 
 
198
  if user_question:
199
  user_input(user_question)
200
 
201
+ # Create side bar
 
 
 
202
  with st.sidebar:
203
  st.subheader("Your Documents")
204
+ pdf_doc = st.file_uploader(label="Upload your documents", accept_multiple_files=True, type=["pdf"])
205
  if st.button("Process"):
206
  with st.spinner(text="Processing"):
207
+ # Get PDF text
 
208
  row_text = get_pdf_text(pdf_doc)
209
+ # Get the text chunk
210
  text_chunk = get_text_chunk(row_text)
211
+ # Create vector store
 
212
  vectorstores = get_vectorstore(text_chunk)
213
+ # Create conversation chain
 
214
  st.session_state.conversation = get_conversation_chain(vectorstores)
215
 
216
+ # Extract and display images from PDFs
217
+ images = get_pdf_images(pdf_doc)
218
+ for img in images:
219
+ st.image(img)
220
+
221
+ # Generate and display images from text using DALL-E
222
+ if user_question:
223
+ generated_images = generate_image_from_text(user_question)
224
+ for gen_img in generated_images:
225
+ st.image(gen_img)
226
 
227
# Entry point: launch the Streamlit app when run as a script.
if __name__ == "__main__":
    main()