1MR commited on
Commit
f87882c
·
verified ·
1 Parent(s): 0a653ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +264 -94
app.py CHANGED
@@ -1,124 +1,294 @@
1
  import streamlit as st
2
- # from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
- from langchain.text_splitter import CharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceInstructEmbeddings
6
- from langchain_community.vectorstores import FAISS
7
- # from langchain.chat_models import ChatOpenAI
 
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.chains import ConversationalRetrievalChain
10
  from htmlTemplates import css, bot_template, user_template
11
- from langchain_community.llms import HuggingFaceHub
 
 
12
  import os
13
- # from sentence_transformers import SentenceTransformer
14
- from langchain.embeddings import HuggingFaceEmbeddings
15
 
16
 
17
- # from huggingface_hub import login
 
 
 
 
 
 
 
 
18
 
19
- # Retrieve the Hugging Face token from environment variables
20
- # token = os.getenv("HUGGINGFACEHUB_TOKEN")
21
- import fitz # PyMuPDF
22
 
23
- def get_pdf_text(pdf_docs):
24
- text = ""
25
- for pdf in pdf_docs:
26
- try:
27
- doc = fitz.open(stream=pdf.read(), filetype="pdf")
28
- for page in doc:
29
- text += page.get_text()
30
- except Exception as e:
31
- st.error(f"Could not read the file: {pdf.name}. Error: {e}")
32
- return text
33
- # def get_pdf_text(pdf_docs):
34
- # text = ""
35
- # for pdf in pdf_docs:
36
- # pdf_reader = PdfReader(pdf)
37
- # for page in pdf_reader.pages:
38
- # text += page.extract_text()
39
- # return text
40
-
41
- def get_text_chunks(text):
42
- text_splitter=CharacterTextSplitter(
43
- separator="\n",
44
- chunk_size=1000,
45
- chunk_overlap=200,
46
- length_function=len
47
- )
48
- chunks=text_splitter.split_text(text)
49
- return chunks
50
-
51
- # token="hf_***REDACTED***"  # SECURITY: a real Hugging Face token was committed here; revoke it and load from an env var instead
52
- # def get_vectorstore(text_chunks):
53
- # # embeddings=HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",huggingfacehub_token=os.getenv("TOKEN_API2"))
54
- # embeddings=HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
55
- # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
56
- # return vectorstore
57
 
58
- # def get_vectorstore(text_chunks):
59
- # # Load a SentenceTransformer model for embeddings
60
- # embedding_model = SentenceTransformer("hkunlp/instructor-xl") # Replace with a model of your choice
61
- # embeddings = [embedding_model.encode(chunk) for chunk in text_chunks]
62
 
63
- # # Create a FAISS vectorstore
64
- # vectorstore = FAISS.from_embeddings(embeddings=embeddings, texts=text_chunks)
65
- # return vectorstore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- def get_vectorstore(text_chunks):
68
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
69
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
70
- return vectorstore
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def get_conversation_chain(vectorstore):
73
- llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512},huggingfacehub_api_token=os.getenv("TOKEN_API2"))
74
- memory=ConversationBufferMemory(
75
- memory_key='chat_history',return_messages=True)
76
- conversation_chain = ConversationalRetrievalChain.from_llm(
77
- llm=llm,
78
- retriever=vectorstore.as_retriever(),
79
- memory=memory
80
- )
81
- return conversation_chain
 
 
 
82
 
 
83
  def handle_userinput(user_question):
84
- response = st.session_state.conversation({'question':user_question})
85
- st.session_state.chat_history = response['chat_history']
 
 
 
 
 
 
 
 
 
 
86
 
87
- for i, message in enumerate(st.session_state.chat_history):
88
- if i % 2 == 0:
89
- st.write(user_template.replace("{{MSG}}", message.content),unsafe_allow_html=True)
90
- else:
91
- st.write(bot_template.replace("{{MSG}}", message.content),unsafe_allow_html=True)
92
 
93
  def main():
94
- st.set_page_config(page_title="Chat with My RAG",
95
- page_icon=":books:")
96
- st.write(css,unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- if "conversation" not in st.session_state:
99
- st.session_state.conversation = None
100
- if "chat_history" not in st.session_state:
101
- st.session_state.chat_history = None
102
 
103
- st.header("Chat with My RAG :books:")
104
- user_question=st.text_input("Ask a question about your documents:")
105
- if user_question:
106
- handle_userinput(user_question)
 
 
 
107
 
108
- with st.sidebar:
109
- st.subheader("Your Documents")
110
- pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
111
- if st.button("Process"):
112
- with st.spinner("Processing"):
113
- raw_text =get_pdf_text(pdf_docs)
 
 
 
 
 
 
 
 
114
 
115
- text_chunks = get_text_chunks(raw_text)
 
116
 
117
- vectorstore = get_vectorstore(text_chunks)
 
118
 
119
- st.session_state.conversation = get_conversation_chain(vectorstore)
 
 
120
 
121
 
122
  if __name__ == '__main__':
123
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
 
1
  import streamlit as st
2
+ from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS, Chroma
7
+ from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
8
+ from langchain.chat_models import ChatOpenAI
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
+ from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
13
+ from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
14
+ import tempfile # Used to stage uploaded files on disk for the document loaders.
15
  import os
 
 
16
 
17
 
18
# Extract documents from an uploaded PDF file.
def get_pdf_text(pdf_docs):
    """Extract LangChain documents (one per page) from an uploaded PDF.

    Args:
        pdf_docs: a single Streamlit ``UploadedFile`` containing a PDF.

    Returns:
        list of Documents produced by ``PyPDFLoader.load()``.
    """
    # PyPDFLoader needs a filesystem path, so stage the upload in a temp
    # directory. The `with` block guarantees cleanup even if loading fails
    # (the original relied on GC finalizing the TemporaryDirectory object).
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, pdf_docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(pdf_docs.getvalue())
        pdf_loader = PyPDFLoader(temp_filepath)
        pdf_doc = pdf_loader.load()  # eager load: safe to delete the dir after
    return pdf_doc
27
 
28
+ # 과제
29
+ # 아래 텍스트 추출 함수를 작성
 
30
 
31
def get_text_file(text_docs):
    """Extract LangChain documents from an uploaded plain-text file.

    Args:
        text_docs: a single Streamlit ``UploadedFile`` containing text.

    Returns:
        list of Documents produced by ``TextLoader.load()``.
    """
    # Stage the upload on disk for TextLoader; `with` guarantees the temp
    # directory is removed deterministically (original relied on GC).
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, text_docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(text_docs.getvalue())
        text_loader = TextLoader(temp_filepath)
        text_doc = text_loader.load()
    return text_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
 
 
40
 
41
def get_csv_file(csv_docs):
    """Extract LangChain documents (one per row) from an uploaded CSV file.

    Args:
        csv_docs: a single Streamlit ``UploadedFile`` containing CSV data.

    Returns:
        list of Documents produced by ``CSVLoader.load()``.
    """
    # Stage the upload on disk for CSVLoader; `with` guarantees the temp
    # directory is removed deterministically (original relied on GC).
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, csv_docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(csv_docs.getvalue())
        csv_loader = CSVLoader(temp_filepath)
        csv_doc = csv_loader.load()
    return csv_doc
49
+
50
def get_json_file(json_docs):
    """Extract LangChain documents from an uploaded JSON file.

    Args:
        json_docs: a single Streamlit ``UploadedFile`` containing JSON.

    Returns:
        list of Documents produced by ``JSONLoader.load()``.
    """
    # Stage the upload on disk for JSONLoader; `with` guarantees cleanup.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, json_docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(json_docs.getvalue())
        # FIX: JSONLoader requires a `jq_schema`; the original call omitted
        # it and raised a TypeError at runtime. "." selects the whole
        # document; text_content=False tolerates non-string values.
        json_loader = JSONLoader(temp_filepath, jq_schema=".", text_content=False)
        json_doc = json_loader.load()
    return json_doc
58
 
 
 
 
 
59
 
60
# Split loaded documents into overlapping chunks for embedding.
def get_text_chunks(documents):
    """Split *documents* into ~1000-char chunks with 200-char overlap.

    Returns the list of chunked Documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,       # maximum characters per chunk
        chunk_overlap=200,     # shared characters between adjacent chunks
        length_function=len,   # measure length in characters
    )
    return splitter.split_documents(documents)
70
+
71
+
72
# Build a vector store from the text chunks.
def get_vectorstore(text_chunks):
    """Embed *text_chunks* with HF Instructor embeddings and index them in FAISS.

    Returns the populated FAISS vector store.
    """
    # OpenAIEmbeddings() is an alternative here if an OpenAI key is configured.
    embedder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_documents(text_chunks, embedder)
81
+
82
+
83
def get_conversation_chain(vectorstore):
    """Wire an LLM, chat memory, and the retriever into a conversational RAG chain.

    Args:
        vectorstore: a vector store exposing ``as_retriever()``.

    Returns:
        a ConversationalRetrievalChain ready to answer questions.
    """
    # Hosted flan-t5-xxl via the HF Inference API; token read from env.
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
        huggingfacehub_api_token=os.getenv("TOKEN_API2"),
    )
    # Buffer memory keeps the running chat history for follow-up questions.
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
96
 
97
# Handle a question typed by the user.
def handle_userinput(user_question):
    """Run *user_question* through the conversation chain and render the chat.

    Side effects: updates ``st.session_state.chat_history`` and writes the
    alternating user/bot message templates to the page.
    """
    # FIX: the chain only exists after the user clicks "Process"; the
    # original crashed with "'NoneType' object is not callable" here.
    if st.session_state.conversation is None:
        st.warning("Please upload and process your documents first.")
        return
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    # History alternates: even indices are user turns, odd are bot turns.
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content),
                 unsafe_allow_html=True)
111
 
 
 
 
 
 
112
 
113
def main():
    """Streamlit entry point: page setup, session state, Q&A box, sidebar upload/processing."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple Files",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialize session state on first run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple Files :")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        openai_key = st.text_input("Paste your OpenAI API key (sk-...)")
        if openai_key:
            # NOTE(review): only used if OpenAI embeddings/LLM are re-enabled;
            # the current pipeline runs entirely on Hugging Face models.
            os.environ["OPENAI_API_KEY"] = openai_key

        st.subheader("Your documents")
        docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # Dispatch each upload to the loader for its MIME type.
                doc_list = []
                for file in docs:
                    print('file - type : ', file.type)
                    if file.type == 'text/plain':
                        doc_list.extend(get_text_file(file))
                    elif file.type in ['application/octet-stream', 'application/pdf']:
                        doc_list.extend(get_pdf_text(file))
                    elif file.type == 'text/csv':
                        doc_list.extend(get_csv_file(file))
                    elif file.type == 'application/json':
                        doc_list.extend(get_json_file(file))
                    else:
                        # FIX: unsupported types were silently dropped before.
                        st.warning(
                            f"Unsupported file type skipped: {file.name} ({file.type})")

                if not doc_list:
                    # FIX: guard — building a vector store from an empty
                    # document list raises deep inside FAISS.
                    st.error("No readable documents were uploaded.")
                    return

                # chunk -> embed/index -> conversational chain
                text_chunks = get_text_chunks(doc_list)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)
166
 
167
 
168
  if __name__ == '__main__':
169
+ main()
170
+
171
+ # import streamlit as st
172
+ # # from dotenv import load_dotenv
173
+ # from PyPDF2 import PdfReader
174
+ # from langchain.text_splitter import CharacterTextSplitter
175
+ # from langchain_community.embeddings import HuggingFaceInstructEmbeddings
176
+ # from langchain_community.vectorstores import FAISS
177
+ # # from langchain.chat_models import ChatOpenAI
178
+ # from langchain.memory import ConversationBufferMemory
179
+ # from langchain.chains import ConversationalRetrievalChain
180
+ # from htmlTemplates import css, bot_template, user_template
181
+ # from langchain_community.llms import HuggingFaceHub
182
+ # import os
183
+ # # from sentence_transformers import SentenceTransformer
184
+ # from langchain.embeddings import HuggingFaceEmbeddings
185
+
186
+
187
+ # # from huggingface_hub import login
188
+
189
+ # # Retrieve the Hugging Face token from environment variables
190
+ # # token = os.getenv("HUGGINGFACEHUB_TOKEN")
191
+ # import fitz # PyMuPDF
192
+
193
+ # def get_pdf_text(pdf_docs):
194
+ # text = ""
195
+ # for pdf in pdf_docs:
196
+ # try:
197
+ # doc = fitz.open(stream=pdf.read(), filetype="pdf")
198
+ # for page in doc:
199
+ # text += page.get_text()
200
+ # except Exception as e:
201
+ # st.error(f"Could not read the file: {pdf.name}. Error: {e}")
202
+ # return text
203
+ # # def get_pdf_text(pdf_docs):
204
+ # # text = ""
205
+ # # for pdf in pdf_docs:
206
+ # # pdf_reader = PdfReader(pdf)
207
+ # # for page in pdf_reader.pages:
208
+ # # text += page.extract_text()
209
+ # # return text
210
+
211
+ # def get_text_chunks(text):
212
+ # text_splitter=CharacterTextSplitter(
213
+ # separator="\n",
214
+ # chunk_size=1000,
215
+ # chunk_overlap=200,
216
+ # length_function=len
217
+ # )
218
+ # chunks=text_splitter.split_text(text)
219
+ # return chunks
220
+
221
+ # # token="hf_***REDACTED***"  # SECURITY: a real Hugging Face token was committed here; revoke it and load from an env var instead
222
+ # # def get_vectorstore(text_chunks):
223
+ # # # embeddings=HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",huggingfacehub_token=os.getenv("TOKEN_API2"))
224
+ # # embeddings=HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
225
+ # # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
226
+ # # return vectorstore
227
+
228
+ # # def get_vectorstore(text_chunks):
229
+ # # # Load a SentenceTransformer model for embeddings
230
+ # # embedding_model = SentenceTransformer("hkunlp/instructor-xl") # Replace with a model of your choice
231
+ # # embeddings = [embedding_model.encode(chunk) for chunk in text_chunks]
232
+
233
+ # # # Create a FAISS vectorstore
234
+ # # vectorstore = FAISS.from_embeddings(embeddings=embeddings, texts=text_chunks)
235
+ # # return vectorstore
236
+
237
+ # def get_vectorstore(text_chunks):
238
+ # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
239
+ # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
240
+ # return vectorstore
241
+
242
+ # def get_conversation_chain(vectorstore):
243
+ # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512},huggingfacehub_api_token=os.getenv("TOKEN_API2"))
244
+ # memory=ConversationBufferMemory(
245
+ # memory_key='chat_history',return_messages=True)
246
+ # conversation_chain = ConversationalRetrievalChain.from_llm(
247
+ # llm=llm,
248
+ # retriever=vectorstore.as_retriever(),
249
+ # memory=memory
250
+ # )
251
+ # return conversation_chain
252
+
253
+ # def handle_userinput(user_question):
254
+ # response = st.session_state.conversation({'question':user_question})
255
+ # st.session_state.chat_history = response['chat_history']
256
+
257
+ # for i, message in enumerate(st.session_state.chat_history):
258
+ # if i % 2 == 0:
259
+ # st.write(user_template.replace("{{MSG}}", message.content),unsafe_allow_html=True)
260
+ # else:
261
+ # st.write(bot_template.replace("{{MSG}}", message.content),unsafe_allow_html=True)
262
+
263
+ # def main():
264
+ # st.set_page_config(page_title="Chat with My RAG",
265
+ # page_icon=":books:")
266
+ # st.write(css,unsafe_allow_html=True)
267
+
268
+ # if "conversation" not in st.session_state:
269
+ # st.session_state.conversation = None
270
+ # if "chat_history" not in st.session_state:
271
+ # st.session_state.chat_history = None
272
+
273
+ # st.header("Chat with My RAG :books:")
274
+ # user_question=st.text_input("Ask a question about your documents:")
275
+ # if user_question:
276
+ # handle_userinput(user_question)
277
+
278
+ # with st.sidebar:
279
+ # st.subheader("Your Documents")
280
+ # pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
281
+ # if st.button("Process"):
282
+ # with st.spinner("Processing"):
283
+ # raw_text =get_pdf_text(pdf_docs)
284
+
285
+ # text_chunks = get_text_chunks(raw_text)
286
+
287
+ # vectorstore = get_vectorstore(text_chunks)
288
+
289
+ # st.session_state.conversation = get_conversation_chain(vectorstore)
290
+
291
+
292
+ # if __name__ == '__main__':
293
+ # main()
294