rstallman nickmuchi commited on
Commit
fc7ad98
·
0 Parent(s):

Duplicate from nickmuchi/DocGPT

Browse files

Co-authored-by: Nicholas Muchinguri <nickmuchi@users.noreply.huggingface.co>

Files changed (7) hide show
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +370 -0
  4. img/logo.jpg +0 -0
  5. img/nm.txt +0 -0
  6. requirements.txt +10 -0
  7. tempdir/nm.txt +0 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: DocGPT
3
+ emoji: 🏃
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.19.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: nickmuchi/DocGPT
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import itertools
4
+ import streamlit as st
5
+ import validators
6
+ from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, WebBaseLoader
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.chat_models import ChatOpenAI
9
+ from langchain.chains import QAGenerationChain
10
+ from langchain.embeddings import HuggingFaceEmbeddings
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.callbacks import StdOutCallbackHandler
13
+ from langchain.chains import ConversationalRetrievalChain, QAGenerationChain, LLMChain
14
+ from langchain.memory import ConversationBufferMemory
15
+ from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
16
+ from langchain.chains.question_answering import load_qa_chain
17
+
18
+ from langchain.prompts.chat import (
19
+ ChatPromptTemplate,
20
+ SystemMessagePromptTemplate,
21
+ AIMessagePromptTemplate,
22
+ HumanMessagePromptTemplate,
23
+ )
24
+
25
# Browser-tab title and icon for the app.
st.set_page_config(
    page_title="DOC QA",
    page_icon=':book:',
)

# Chat history buffer passed to the ConversationalRetrievalChain built in
# load_retrieval_chain; the chain's 'answer' output is what gets stored.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
    output_key='answer',
)
28
+
29
+
30
@st.cache_data
def save_file_locally(file):
    '''Save an uploaded file under ``tempdir`` and return the path written.

    Args:
        file: Uploaded file object exposing ``name`` and ``getbuffer()``.

    Returns:
        The path of the local copy, ``tempdir/<base name of file.name>``.
    '''
    # Keep only the last path component so a name such as "../../x" cannot
    # place the file outside the upload directory.
    safe_name = os.path.basename(file.name)

    # Do not rely on the directory already existing in the working tree.
    os.makedirs('tempdir', exist_ok=True)

    doc_path = os.path.join('tempdir', safe_name)
    with open(doc_path, 'wb') as f:
        f.write(file.getbuffer())

    return doc_path
38
+
39
@st.cache_data
def load_prompt():
    '''Build the chat prompt used to answer a question from retrieved context.

    Returns:
        A ChatPromptTemplate consisting of a system message (the answering
        rules followed by the ``{context}`` placeholder) and a human message
        that carries the ``{question}``.
    '''
    answering_rules = """Use only the following pieces of context to answer the users question accurately.
Do not use any information not provided in the earnings context.
If you don't know the answer, just say 'There is no relevant answer in the given documents',
don't try to make up an answer.

ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.

Remember, do not reference any information not given in the context.
If the answer is not available in the given context just say 'There is no relevant answer in the given document'

Follow the below format when answering:

Question: {question}
SOURCES: [xyz]

Begin!
----------------
{context}"""

    system_message = SystemMessagePromptTemplate.from_template(answering_rules)
    human_message = HumanMessagePromptTemplate.from_template("{question}")

    return ChatPromptTemplate.from_messages([system_message, human_message])
69
+
70
@st.cache_data
def load_docs(files, url=False):
    '''Load uploaded files or a web page and return their text as one string.

    Args:
        files: Iterable of uploaded file objects when ``url`` is false;
            otherwise the value handed to ``WebBaseLoader`` (the page URL).
        url: Selects the web-page branch when true.

    Returns:
        The ``page_content`` of every loaded document, joined with ','.
        Files with an unsupported extension are skipped with a warning.
    '''
    # Loader class for each supported file extension.
    loaders = {
        ".pdf": PyPDFLoader,
        ".txt": TextLoader,
        ".docx": Docx2txtLoader,
    }

    if url:
        st.info("`Reading web link ...`")
        documents = WebBaseLoader(files).load()
    else:
        st.info("`Reading doc ...`")
        documents = []
        for file in files:
            # Lower-case the extension so "REPORT.PDF" is handled like
            # "report.pdf".
            file_extension = os.path.splitext(file.name)[1].lower()
            loader_cls = loaders.get(file_extension)
            if loader_cls is None:
                # Warn before anything is written to disk for this file.
                st.warning('Please provide txt or pdf or docx.', icon="⚠️")
                continue

            doc_path = save_file_locally(file)
            documents.extend(loader_cls(doc_path).load())

    return ','.join(doc.page_content for doc in documents)
109
+
110
# Short model label -> model identifier looked up by gen_embeddings.
bi_enc_dict = {
    'mpnet-base-v2': "all-mpnet-base-v2",
    'instructor-large': 'hkunlp/instructor-large',
}
112
+
113
@st.cache_resource
def gen_embeddings(model_name):
    '''Return the embedding model registered under ``model_name``.

    Cached with ``st.cache_resource`` because the result is a model object
    to be shared, not data to be copied on every call.

    Args:
        model_name: A key of the module-level ``bi_enc_dict``.

    Returns:
        ``HuggingFaceInstructEmbeddings`` for keys starting with
        'instructor', otherwise ``HuggingFaceEmbeddings``.

    Raises:
        ValueError: If ``model_name`` is not a key of ``bi_enc_dict``.
    '''
    if model_name not in bi_enc_dict:
        raise ValueError(f"Unknown embedding model: {model_name!r}")

    model_id = bi_enc_dict[model_name]

    # Match on the prefix rather than one exact key: the dictionary offered
    # in the sidebar lists 'instructor-base', not 'instructor-large'.
    if model_name.startswith('instructor'):
        # Local import: the file's import section does not bring this class
        # into scope.
        # NOTE(review): presumably needs the InstructorEmbedding package at
        # runtime, which requirements.txt does not list -- confirm.
        from langchain.embeddings import HuggingFaceInstructEmbeddings

        return HuggingFaceInstructEmbeddings(
            model_name=model_id,
            query_instruction='Represent the question for retrieving supporting paragraphs: ',
            embed_instruction='Represent the paragraph for retrieval: ',
        )

    return HuggingFaceEmbeddings(model_name=model_id)
128
+
129
def load_retrieval_chain(vectorstore):
    '''Build the conversational retrieval chain on top of ``vectorstore``.

    Args:
        vectorstore: Vector store whose ``as_retriever`` supplies the
            documents for answering.

    Returns:
        A ``ConversationalRetrievalChain`` that uses the module-level
        ``memory``, the prompt from ``load_prompt`` and returns its source
        documents alongside the answer.
    '''
    # Callbacks attached to the chat model.
    callback_handler = [StdOutCallbackHandler()]

    chat_llm = ChatOpenAI(
        streaming=True,
        model_name='gpt-4',
        callbacks=callback_handler,
        verbose=True,
        temperature=0,
    )

    # Rewrites a follow-up question into a standalone one.
    question_generator = LLMChain(llm=chat_llm, prompt=CONDENSE_QUESTION_PROMPT)
    # Answers from the retrieved documents placed into a single prompt.
    doc_chain = load_qa_chain(llm=chat_llm, chain_type="stuff", prompt=load_prompt())

    # The keyword must be spelled ``search_kwargs``; the previous spelling
    # ``search_kwags`` meant the k=3 limit was never applied.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

    chain = ConversationalRetrievalChain(
        retriever=retriever,
        question_generator=question_generator,
        combine_docs_chain=doc_chain,
        memory=memory,
        return_source_documents=True,
        # Pass the stored history through unchanged.
        get_chat_history=lambda h: h,
    )

    return chain
152
+
153
@st.cache_resource
def process_corpus(corpus, model_name, chunk_size=1000, overlap=50):
    '''Chunk ``corpus``, index the chunks in FAISS and return the QA chain.

    Args:
        corpus: Text to split and index.
        model_name: Embedding model key forwarded to ``gen_embeddings``.
        chunk_size: Chunk size given to the text splitter.
        overlap: Chunk overlap given to the text splitter.

    Returns:
        The chain produced by ``load_retrieval_chain`` for the new index.
    '''
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    chunks = splitter.split_text(corpus)

    # Tell the user how many chunks the splitter produced.
    st.write(f"Number of text chunks: {len(chunks)}")

    index = FAISS.from_texts(chunks, gen_embeddings(model_name))

    return load_retrieval_chain(index)
173
+
174
@st.cache_data
def run_qa_chain(text, query, model_name):
    '''Answer ``query`` against ``text`` and return the chain's result.

    The chain comes from ``process_corpus(text, model_name)`` and is invoked
    with ``{"question": query}``; its output is returned unchanged.
    '''
    qa_chain = process_corpus(text, model_name)
    return qa_chain({"question": query})
183
+
184
@st.cache_resource
def gen_qa_response(text, model_name, user_question=None):
    '''Render the answer to ``user_question`` and the sample Q&A sidebar.

    Args:
        text: Corpus text passed to ``run_qa_chain`` and ``generate_eval``.
        model_name: Embedding model key passed to ``run_qa_chain``.
        user_question: Question to answer. Defaults to ``None`` so a caller
            that has no question yet can still call this function; the
            answer section is skipped when it is empty.

    Returns:
        None. Output is written to the Streamlit page and sidebar.
    '''
    import html

    if user_question:
        result = run_qa_chain(text, user_question, model_name)

        references = [doc.page_content for doc in result['source_documents']]
        answer = result['answer']

        with st.expander(label='Query Result', expanded=True):
            st.write(answer)

        with st.expander(label='References from Corpus used to Generate Result'):
            for ref in references:
                st.write(ref)

    # NOTE(review): the nesting of the section below was reconstructed from
    # a copy of the file without indentation; it is placed at function
    # level (runs with or without a question) -- confirm against the repo.

    # Generate the sample question-answer pairs once per document set.
    if 'eval_set' not in st.session_state:
        num_eval_questions = 10  # Number of question-answer pairs to generate
        st.session_state.eval_set = generate_eval(text, num_eval_questions, 3000)

    # Display the question-answer pairs in the sidebar with smaller text.
    for i, qa_pair in enumerate(st.session_state.eval_set):
        # The pairs are model output derived from the uploaded document, and
        # the markup is rendered with unsafe_allow_html, so escape them.
        question_html = html.escape(str(qa_pair['question']))
        answer_html = html.escape(str(qa_pair['answer']))
        card = "\n".join([
            '',
            '<div class="css-card">',
            f'<span class="card-tag">Question {i + 1}</span>',
            f'<p style="font-size: 12px;">{question_html}</p>',
            f'<p style="font-size: 12px;">{answer_html}</p>',
            '</div>',
            '',
        ])
        st.sidebar.markdown(card, unsafe_allow_html=True)

    st.write("Ready to answer questions.")
223
+
224
@st.cache_data
def generate_eval(raw_text, N, chunk):
    '''Generate sample question/answer pairs from random excerpts of a text.

    Args:
        raw_text: Text to draw excerpts from.
        N: Number of excerpts to draw (one generation call per distinct
            excerpt).
        chunk: Length in characters of each excerpt.

    Returns:
        A flat list with the items produced by ``QAGenerationChain`` for
        every excerpt that succeeded; empty when ``raw_text`` is empty.
    '''
    # Nothing to ask questions about; skip the model calls entirely.
    if not raw_text:
        return []

    update = st.empty()
    ques_update = st.empty()
    update.info("`Generating sample questions ...`")

    n = len(raw_text)
    # random.randint(a, b) raises ValueError when b < a, which happened for
    # any text shorter than ``chunk``; clamp the last valid start at 0.
    last_start = max(0, n - chunk)
    starting_indices = [random.randint(0, last_start) for _ in range(N)]
    # Identical start offsets yield identical excerpts; keep each offset
    # once (order preserved) so the same excerpt is not sent repeatedly.
    starting_indices = list(dict.fromkeys(starting_indices))
    sub_sequences = [raw_text[i:i + chunk] for i in starting_indices]

    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0, model_name='gpt-4'))
    eval_set = []
    for i, b in enumerate(sub_sequences):
        try:
            qa = chain.run(b)
        except Exception:
            # Best effort: report the failed excerpt and carry on. Unlike a
            # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
            st.warning(f'Error in generating Question: {i+1}...', icon="⚠️")
            continue

        eval_set.append(qa)
        ques_update.info(f"Creating Question: {i+1}")

    # Each run returns a list of pairs; flatten them into a single list.
    eval_set_full = list(itertools.chain.from_iterable(eval_set))

    # Remove the progress placeholders.
    update.empty()
    ques_update.empty()

    return eval_set_full
258
+
259
# Custom CSS: hides the Streamlit menu and footer and styles the sidebar
# question cards (.css-card / .card-tag).
_CUSTOM_CSS = """
<style>

#MainMenu {visibility: hidden;
# }
footer {visibility: hidden;
}
.css-card {
border-radius: 0px;
padding: 30px 10px 10px 10px;
background-color: black;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
margin-bottom: 10px;
font-family: "IBM Plex Sans", sans-serif;
}

.card-tag {
border-radius: 0px;
padding: 1px 5px 1px 5px;
margin-bottom: 10px;
position: absolute;
left: 0px;
top: 0px;
font-size: 0.6rem;
font-family: "IBM Plex Sans", sans-serif;
color: white;
background-color: green;
}

.css-zt5igj {left:0;
}

span.css-10trblm {margin-left:0;
}

div.css-1kyxreq {margin-top: -40px;
}

</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

st.sidebar.image("img/logo.jpg")

# Page heading with a small "beta" marker.
_TITLE_HTML = """
<div style="display: flex; align-items: center; margin-left: 0;">
<h1 style="display: inline-block;">DOC GPT</h1>
<sup style="margin-left:5px;font-size:small; color: green;">beta</sup>
</div>
"""
st.write(_TITLE_HTML, unsafe_allow_html=True)

st.sidebar.title("Menu")

# Use RecursiveCharacterTextSplitter as the default and only text splitter
splitter_type = "RecursiveCharacterTextSplitter"

# Input option 1: one or more uploaded documents.
uploaded_files = st.file_uploader(
    "Upload a PDF or TXT or DOCX Document",
    type=["pdf", "txt", "docx"],
    accept_multiple_files=True,
)

st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)

# Input option 2: a URL to load instead of files.
url_text = st.text_input("Please Enter a url here for an html file you would like to load..")

# Embedding models offered in the sidebar (label -> model identifier).
bi_enc_dict = {
    'mpnet-base-v2': "all-mpnet-base-v2",
    'instructor-base': 'hkunlp/instructor-base',
}

model_name = st.sidebar.selectbox(
    "Embedding Model",
    options=list(bi_enc_dict.keys()),
    key='sbox',
)
337
+
338
if uploaded_files:
    # When the set of uploaded files changes, remember the new set and drop
    # the sample questions generated for the previous one.
    if 'last_uploaded_files' not in st.session_state or st.session_state.last_uploaded_files != uploaded_files:
        st.session_state.last_uploaded_files = uploaded_files
        if 'eval_set' in st.session_state:
            del st.session_state['eval_set']

    # Load and process the uploaded PDF, TXT or DOCX files.
    raw_text = load_docs(uploaded_files)
    st.success("Documents uploaded and processed.")

    # Question and answering
    user_question = st.text_input("Enter your question:")

    gen_qa_response(raw_text, model_name, user_question)

elif url_text and validators.url(url_text):

    # When the URL changes, remember it and drop the sample questions
    # generated for the previous page.
    if 'url_files' not in st.session_state or st.session_state.url_files != url_text:
        st.session_state.url_files = url_text
        if 'eval_set' in st.session_state:
            del st.session_state['eval_set']

    # Load and process the web page behind the URL.
    loaded_docs = load_docs(url_text, url=True)
    st.success("Web Document uploaded and processed.")

    # Question and answering. This branch previously called
    # gen_qa_response without a question, which raised TypeError and left
    # the user no way to query a web document.
    user_question = st.text_input("Enter your question:")

    gen_qa_response(loaded_docs, model_name, user_question)


st.markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=nickmuchi-doc-gpt)")
img/logo.jpg ADDED
img/nm.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ sentence_transformers
3
+ faiss-cpu
4
+ openai
5
+ huggingface_hub
6
+ pypdf
7
+ docx2txt
8
+ validators
9
+ bs4
10
+ altair<5
tempdir/nm.txt ADDED
File without changes