PhilPome committed on
Commit
f8ea069
·
1 Parent(s): bdaeb8e

Upload 2 files

Browse files
Files changed (2) hide show
  1. requirements.txt +10 -0
  2. virtual_consultant.py +370 -0
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ pillow
2
+ beautifulsoup4
3
+ requests
4
+ langchain
5
+ faiss-cpu
6
+ pdfplumber
7
+ openai
8
+ tiktoken
9
+ gradio
10
+ PyPDF2
virtual_consultant.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import urllib
4
+ import requests
5
+ import io
6
+ from collections import Counter
7
+ from pathlib import Path
8
+ import pdfplumber
9
+ from bs4 import BeautifulSoup
10
+ import faiss
11
+
12
+ from langchain.llms import OpenAI
13
+ from langchain.chains import LLMChain, ConstitutionalChain
14
+ from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
15
+ from langchain import PromptTemplate
16
+ from langchain.text_splitter import CharacterTextSplitter
17
+ from langchain.vectorstores import FAISS
18
+ from langchain.docstore.document import Document
19
+ from langchain.embeddings import OpenAIEmbeddings
20
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
21
+ from langchain.document_loaders import PyPDFLoader
22
+
23
# Bing Web Search subscription key, sent in the "Ocp-Apim-Subscription-Key"
# header by scrape_bing_results. It is read from the environment so the secret
# is never committed to source control; it is an empty string when the
# variable is unset.
# NOTE(review): a key that was previously hard-coded on this line should be
# treated as compromised and rotated.
BING_API_KEY = os.environ.get("BING_API_KEY", "")
24
+
25
+
26
def scrape_article(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        The text of every ``<p>`` tag on the page joined by single spaces
        (an empty string when the page contains no ``<p>`` tags).

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    return " ".join(paragraph.get_text() for paragraph in soup.find_all("p"))
31
+
32
+
33
def is_not_pdf(url):
    """Return True unless *url* looks like a link to a PDF document.

    A URL counts as a PDF when either the whole string or its path component
    ends with ``.pdf`` (case-insensitive). Checking the path as well catches
    links such as ``report.pdf?download=1`` or ``report.pdf#page=2`` that a
    plain suffix test on the full URL misses.

    Args:
        url: The URL to classify.

    Returns:
        False when the URL appears to reference a PDF, True otherwise.
    """
    # Local import so the function does not depend on ``urllib.parse`` having
    # been loaded as a side effect of another module.
    from urllib.parse import urlparse

    lowered = url.lower()
    if lowered.endswith(".pdf"):
        return False
    try:
        path = urlparse(lowered).path
    except ValueError:
        # Malformed URL: fall back to the plain suffix check done above.
        return True
    return not path.endswith(".pdf")
35
+
36
+
37
def extract_text_from_pdf_url(pdf_url, timeout=30):
    """Download a PDF and return the text set in its most frequent font.

    Every character of the document is tallied by its ``(size, fontname)``
    pair; only characters whose pair is the most common one are kept.
    NOTE(review): this presumably isolates body text from headings, footers
    and captions - confirm that is the intent.

    Args:
        pdf_url: Address of the PDF to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One string per page (the kept characters concatenated in the order
        pdfplumber reports them), joined by newlines. An empty string when
        the document contains no characters at all.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(pdf_url, timeout=timeout)

    # Parse the document once and keep each page's character records; the
    # records are needed twice (font tally, then filtering).
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        chars_by_page = [page.chars for page in pdf.pages]

    font_counts = Counter(
        (char['size'], char['fontname'])
        for page_chars in chars_by_page
        for char in page_chars
    )
    if not font_counts:
        # No pages, or pages without any text layer (e.g. scanned images).
        return ""

    dominant_font = font_counts.most_common(1)[0][0]

    return "\n".join(
        "".join(
            char['text']
            for char in page_chars
            if (char['size'], char['fontname']) == dominant_font
        )
        for page_chars in chars_by_page
    )
62
+
63
+
64
def scrape_bing_results(url, n=3, timeout=10):
    """Call a Bing Web Search API URL and return the first result links.

    Args:
        url: Fully built Bing search API request URL.
        n: Maximum number of result URLs to return.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        Up to *n* values of the ``url`` field from ``webPages.value`` in the
        JSON response, in response order. An empty list when the response
        has no ``webPages``/``value`` section.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
    response = requests.get(url, headers=headers, timeout=timeout)
    results = response.json()

    if 'webPages' not in results or 'value' not in results['webPages']:
        return []

    return [hit['url'] for hit in results['webPages']['value'][:n]]
79
+
80
+
81
def get_search_url_bing(query):
    """Build the Bing Web Search v7 API URL for an unrestricted *query*.

    Args:
        query: Free-text search query.

    Returns:
        The request URL with the query URL-encoded (spaces become ``+``).
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` is loaded.
    from urllib.parse import quote_plus

    return f"https://api.bing.microsoft.com/v7.0/search?q={quote_plus(query)}"
83
+
84
+
85
class ChatbotAssistant:
    """Question-answering assistant backed by Bing search and a FAISS index.

    Articles found through the Bing Web Search API are split into chunks,
    embedded with OpenAI embeddings and stored in a FAISS index that is also
    pickled to ``search_index.pickle``. Questions are answered by a
    QA-with-sources chain over the four most similar chunks; a second chain
    rates each answer, and a low rating triggers a fresh search.

    Credentials are read from the ``OPENAI_API_KEY`` and ``BING_API_KEY``
    environment variables.
    NOTE(review): keys that were previously hard-coded in this class should be
    treated as compromised and rotated.
    """

    # Location of the pickled FAISS index, relative to the working directory.
    INDEX_PATH = "search_index.pickle"

    def __init__(self):
        """Create the LLM chains and load a previously saved index, if any."""
        self.temperature = 0.7
        # Secrets come from the environment, never from source control.
        self.BING_API_KEY = os.environ.get("BING_API_KEY", "")
        self.openai_api_key = os.environ.get("OPENAI_API_KEY")
        self.chain = load_qa_with_sources_chain(
            OpenAI(temperature=self.temperature, openai_api_key=self.openai_api_key))
        self.search_index = None
        self.articles = []
        self.source_urls = []
        self.sources = [
            "https://home.kpmg/",
            "https://www.ibisworld.com",
            "https://www.bcg.com/",
            "https://www.mckinsey.com/",
            "https://www2.deloitte.com/",
            "https://www.pwc.co.uk/",
            "https://www.ey.com/en_gl"
        ]

        if os.path.exists(self.INDEX_PATH):
            # SECURITY: unpickling executes arbitrary code. Only load index
            # files this application wrote itself.
            with open(self.INDEX_PATH, "rb") as f:
                self.search_index = pickle.load(f)

        self.qa_prompt = PromptTemplate(
            template="Q: {question} A:",
            input_variables=["question"],
        )
        self.qa_chain = LLMChain(
            llm=OpenAI(
                temperature=self.temperature,
                openai_api_key=self.openai_api_key,
                max_tokens=300,
            ),
            prompt=self.qa_prompt,
        )

        # Chain used to grade answers: the critique asks for a 1-10 rating and
        # the revision asks for that rating as a single integer.
        self.constitutional_chain = ConstitutionalChain.from_llm(
            llm=OpenAI(openai_api_key=self.openai_api_key),
            chain=self.qa_chain,
            constitutional_principles=[
                ConstitutionalPrinciple(
                    critique_request="Rate the quality of this answer on a scale of 1 (bad) to 10 (good). If the answer is'I don't know' or similar return a 0.",
                    revision_request="Return the rating as a single integer."
                )
            ],
        )

    def get_search_url(self, query, site=None):
        """Build a Bing Web Search v7 API URL, optionally limited to *site*.

        Args:
            query: Free-text search query.
            site: When truthy, ``site:<site>`` is prepended to the query.

        Returns:
            The request URL with the (possibly prefixed) query URL-encoded.
        """
        # Explicit submodule import: a bare ``import urllib`` does not
        # guarantee that ``urllib.parse`` is loaded.
        from urllib.parse import quote_plus

        if site:
            query = f"site:{site} {query}"
        return f"https://api.bing.microsoft.com/v7.0/search?q={quote_plus(query)}"

    def update_search_index(self):
        """Rebuild the FAISS index from ``self.articles`` and persist it.

        Each article is split into chunks of at most 1024 characters tagged
        with the matching entry of ``self.source_urls``. When no chunk is
        produced the current index (in memory and on disk) is left untouched.
        """
        splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
        source_chunks = [
            Document(page_content=chunk, metadata={"source": url})
            for text, url in zip(self.articles, self.source_urls)
            for chunk in splitter.split_text(text)
        ]
        if not source_chunks:
            print("No text was retrieved; the search index was not updated.")
            return

        # Build the index BEFORE opening the file for writing, so a failure
        # while embedding cannot leave a truncated pickle behind.
        index = FAISS.from_documents(
            source_chunks, OpenAIEmbeddings(openai_api_key=self.openai_api_key))
        with open(self.INDEX_PATH, "wb") as f:
            pickle.dump(index, f)
        self.search_index = index

    def _ingest_urls(self, urls):
        """Download each URL and append its text and address to the buffers."""
        for url in urls:
            try:
                if is_not_pdf(url):
                    text = scrape_article(url)
                else:
                    text = extract_text_from_pdf_url(url)
            except requests.RequestException as exc:
                # One unreachable page should not abort the whole retrieval.
                print(f"Skipping {url}: {exc}")
                continue
            self.articles.append(text)
            self.source_urls.append(url)

    def retrieve_articles(self, question):
        """Fetch the top Bing hit from every configured source and re-index."""
        self.articles = []
        self.source_urls = []

        for source in self.sources:
            search_url = self.get_search_url(question, source)
            self._ingest_urls(scrape_bing_results(search_url, 1))

        self.update_search_index()

    def retrieve_alternative_articles(self, question):
        """Fetch the top five unrestricted Bing hits and re-index."""
        self.articles = []
        self.source_urls = []

        search_url = get_search_url_bing(question)
        self._ingest_urls(scrape_bing_results(search_url, 5))

        self.update_search_index()

    @staticmethod
    def _parse_rating(evaluation):
        """Extract an integer rating from the grading chain's reply.

        The last whitespace-separated token is tried first. If it is not a
        plain integer (e.g. ``"8/10"`` or ``"7."``), the last number found in
        the text is used, ignoring the denominator of ``N/M`` and
        ``N out of M``. Returns 0 when no number is present, which callers
        treat as a failing grade.
        """
        import re

        tokens = evaluation.strip().split()
        try:
            return int(tokens[-1])
        except (IndexError, ValueError):
            pass
        numbers = re.findall(r"(\d+)(?:\s*(?:/|out of)\s*\d+)?", evaluation)
        return int(numbers[-1]) if numbers else 0

    def _answer(self, question):
        """Answer *question* from the CURRENT index (4 nearest chunks)."""
        if self.search_index is None:
            # Nothing to search; reuse the sentinel the caller already checks.
            return "I don't know."
        input_documents = self.search_index.similarity_search(question, k=4)
        answers = self.chain(
            {
                "input_documents": input_documents,
                "question": question,
            },
            return_only_outputs=True,
        )
        return answers["output_text"]

    def _is_poor_answer(self, answer, rating_threshold):
        """Return True when *answer* admits ignorance or is rated too low."""
        if "I don't know" in answer:
            return True  # no need to spend an LLM call on grading
        evaluation = self.constitutional_chain.run(question=answer)
        return self._parse_rating(evaluation) < rating_threshold

    def chatbot_assistant(self, question, custom_sources=None, rating_threshold=6):
        """Answer *question*, searching the web when the index falls short.

        Steps:
          1. If an index exists, answer from it and grade the answer.
          2. If there is no index or the grade is poor, search the configured
             sources, re-index and answer again.
          3. If that answer is still poor, run an unrestricted Bing search,
             re-index and answer once more.

        Documents are re-queried after every re-index so each answer is based
        on the freshly retrieved material.

        Args:
            question: The user's question.
            custom_sources: Optional replacement for ``self.sources``.
            rating_threshold: Minimum acceptable rating (ratings below it
                trigger another search).

        Returns:
            The text of the final answer.
        """
        # Update the assistant's sources with the provided custom sources
        if custom_sources:
            self.sources = custom_sources
            print(custom_sources)

        answer = None
        needs_search = True
        if self.search_index is not None:
            answer = self._answer(question)
            needs_search = self._is_poor_answer(answer, rating_threshold)

        if needs_search:
            print("Launching a new Bing search.")
            self.retrieve_articles(question)
            answer = self._answer(question)

            # Check again after retrieving from the original sources
            if self._is_poor_answer(answer, rating_threshold):
                self.retrieve_alternative_articles(question)
                answer = self._answer(question)

        # The raw article buffers are only needed while (re)building the index.
        self.articles = []
        self.source_urls = []

        return answer

    def add_pdf_source(self, pdf_text, pdf_filename):
        """Replace the index with one built solely from an uploaded PDF.

        Args:
            pdf_text: Text extracted from the PDF.
            pdf_filename: Name recorded as the ``source`` of every chunk.
        """
        self.search_index = None
        self.articles = []
        self.source_urls = []

        self.articles.append(pdf_text)
        print(pdf_text)
        self.source_urls.append(pdf_filename)
        print(pdf_filename)
        self.update_search_index()
294
+
295
+
296
+ import gradio as gr
297
+ import time
298
+ import tempfile
299
+ import PyPDF2
300
+
301
# Module-level ChatbotAssistant instance used by the `user` and `bot`
# callbacks defined below. Constructing it runs ChatbotAssistant.__init__
# at import time.
assistant = ChatbotAssistant()
303
+
304
def process_pdf(file_obj):
    """Extract the text of every page of an uploaded PDF.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the PDF on
            disk (the upload handle passed in by the `user` callback).

    Returns:
        The text of all pages concatenated in page order, with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(file_obj.name)
    # ``or ""`` guards against a page yielding a falsy result (e.g. None)
    # instead of a string, which would otherwise break the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
314
+
315
def user(user_message, custom_sources, history, pdf_upload):
    """Handle a submitted question: apply settings and queue the message.

    Args:
        user_message: Text typed into the question box.
        custom_sources: Comma-separated list of source URLs (may be empty).
        history: Chat history as a list of ``(question, answer)`` pairs;
            ``None`` is treated as an empty history.
        pdf_upload: Uploaded PDF handle with a ``name`` path attribute, or a
            falsy value when nothing was uploaded.

    Returns:
        A 3-tuple ``(cleared question box, custom_sources unchanged,
        history with the new question appended and answer set to None)``.
    """
    # Update the assistant's sources with the provided custom sources.
    # Split on bare commas and trim, so "a,b" and "a , b" both work; ignore
    # empty entries and keep the current sources if nothing usable remains.
    if custom_sources:
        parsed_sources = [part.strip() for part in custom_sources.split(",")]
        parsed_sources = [part for part in parsed_sources if part]
        if parsed_sources:
            assistant.sources = parsed_sources

    # Process the uploaded PDF file and add it to the assistant's sources
    if pdf_upload:
        print("PDF upload is triggered")
        pdf_file_name = os.path.basename(pdf_upload.name)
        pdf_text = process_pdf(pdf_upload)
        assistant.add_pdf_source(pdf_text, pdf_file_name)

    return "", custom_sources, (history or []) + [(user_message, None)]
329
+
330
+
331
def bot(history):
    """Fill in the answer for the most recent question in *history*.

    The last entry's first element is taken as the question, answered via
    ``assistant.chatbot_assistant`` and the entry is replaced in place by a
    ``(question, answer)`` tuple. The same list object is returned after a
    one-second pause.
    """
    pending_turn = history[-1]
    prompt = pending_turn[0]
    reply = assistant.chatbot_assistant(prompt)
    history[-1] = (prompt, reply)
    time.sleep(1)
    return history
337
+
338
def copy_last_response(history, saved_responses):
    """Append the latest chatbot answer to the saved-responses text.

    Args:
        history: Chat history as a sequence of ``(question, answer)`` pairs.
        saved_responses: Text accumulated so far (may be empty or None).

    Returns:
        ``saved_responses`` followed by a blank line and the latest answer,
        or just the answer when nothing was saved yet. ``saved_responses``
        is returned unchanged when the history is empty or the latest entry
        has no answer yet.
    """
    if not history:
        return saved_responses

    last_response = history[-1][1]
    if not last_response:
        # The answer slot is still None/empty (question not yet answered);
        # concatenating it would raise or wipe the saved text.
        return saved_responses

    if saved_responses:
        return saved_responses + "\n\n" + last_response
    return last_response
346
+
347
# Initial content of the "Custom Sources" textbox.
default_sources = "https://home.kpmg/, https://www.ibisworld.com, https://www.bcg.com/, https://www.mckinsey.com/, https://www2.deloitte.com/, https://www.pwc.co.uk/, https://www.ey.com/en_gl"

# Three-column layout: settings | chat | saved responses.
with gr.Blocks() as demo:
    fn = process_pdf

    with gr.Row():
        # Left column: source configuration and PDF upload.
        with gr.Column(scale=1, min_width=200):
            custom_sources = gr.Textbox(
                label="Custom Sources (comma-separated URLs)",
                value=default_sources,
                lines=5,
            )
            pdf_upload = gr.File(file_types=[".pdf"], label="Upload PDF")

        # Middle column: the conversation itself.
        with gr.Column(scale=2, min_width=400):
            chatbot = gr.Chatbot(label="AI Consultant")
            msg = gr.Textbox(label="Your Question")
            submit = gr.Button("Submit")
            clear = gr.Button("Clear History")

        # Right column: scratchpad of copied answers.
        with gr.Column(scale=1, min_width=200):
            copy_button = gr.Button("Copy Last Response")
            saved_responses = gr.Textbox(label="Saved Responses", lines=10)

    # Submit: first record the question via `user`, then let `bot` answer it.
    question_recorded = submit.click(
        user,
        [msg, custom_sources, chatbot, pdf_upload],
        [msg, custom_sources, chatbot],
        queue=False,
    )
    question_recorded.then(bot, chatbot, chatbot)

    # Clear: returning None as the chatbot value empties the history.
    clear.click(lambda: None, None, chatbot, queue=False)

    copy_button.click(
        copy_last_response,
        [chatbot, saved_responses],
        saved_responses,
        queue=False,
    )

demo.launch(debug=True)