IVentureISB committed on
Commit f324b20 · 1 Parent(s): 512ef96

web_scrape_embedding_context

Creates embeddings from content scraped from the site's webpages, for use as QnA context.

Files changed (1)
  1. scrape_create_context.py +327 -0
scrape_create_context.py ADDED
@@ -0,0 +1,327 @@
+ # -*- coding: utf-8 -*-
+ """ISB chatbot.ipynb
+ 
+ Original file is located at
+     https://colab.research.google.com/drive/1GYmsZSR4MWuvORNpSWFWrXz79lQKb6oc
+ """
+ 
+ """# Scrape"""
+ 
+ # Regex to match a URL
+ # HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
+ 
+ # Define the root domain to crawl
+ domain = "i-venture.org"
+ sitemap_url = "https://i-venture.org/sitemap.xml"
+ full_url = "https://i-venture.org/"
+ 
+ import os
+ 
+ RESULTS_DIR = "scraped_files/"
+ os.makedirs(RESULTS_DIR, exist_ok=True)
+ 
+ import requests
+ import urllib.request
+ from bs4 import BeautifulSoup
+ from collections import deque
+ from urllib.parse import urlparse
+ import pandas as pd
+ import numpy as np
+ 
+ def get_sitemap(url=sitemap_url):
+     """Fetch the sitemap and return the list of page URLs it declares."""
+     try:
+         with urllib.request.urlopen(url) as response:
+             xml = BeautifulSoup(response,
+                                 'lxml-xml',
+                                 from_encoding=response.info().get_param('charset'))
+ 
+         locs = []
+         # Read each <loc> from its own <url> entry (the original looked up
+         # <loc> on the whole document, which always matched)
+         for url_tag in xml.find_all("url"):
+             loc = url_tag.find("loc")
+             if loc is not None:
+                 locs.append(loc.text)
+ 
+         return locs
+     except Exception as e:
+         print(e)
+         return []
+ 
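+ # A quick sanity check of the sitemap parse; the URLs below are hypothetical
+ # examples of entries i-venture.org's sitemap might list:
+ #   get_sitemap()
+ #   # -> ['https://i-venture.org/', 'https://i-venture.org/about/', ...]
+ 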
+ def crawl(url):
+     # Parse the URL and get the domain
+     # local_domain = urlparse(url).netloc
+ 
+     queue = deque(get_sitemap())
+ 
+     os.makedirs(RESULTS_DIR + "text/", exist_ok=True)
+     os.makedirs(RESULTS_DIR + "processed", exist_ok=True)
+ 
+     # While the queue is not empty, continue crawling
+     while queue:
+         # Get the next URL from the queue
+         url = queue.pop()
+         print(url)  # for debugging and to see the progress
+ 
+         # Save text from the url to a <url>.txt file
+         with open(f'{RESULTS_DIR}text/' + url.strip("/").replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
+             soup = BeautifulSoup(requests.get(url).text, "html.parser")
+             text = soup.get_text()
+ 
+             # Pages that require JavaScript cannot be rendered by requests;
+             # flag them and move on (their placeholder text is still written)
+             if "You need to enable JavaScript to run this app." in text:
+                 print("Unable to parse page " + url + " due to JavaScript being required")
+ 
+             f.write(text)
+ 
+         # # Get the hyperlinks from the URL and add them to the queue
+         # for link in get_domain_hyperlinks(local_domain, url):
+         #     if link not in seen:
+         #         queue.append(link)
+         #         seen.add(link)
+ 
+ def remove_newlines(serie):
+     serie = serie.str.replace('\n', ' ')
+     serie = serie.str.replace('\\n', ' ')
+     serie = serie.str.replace('  ', ' ')
+     serie = serie.str.replace('  ', ' ')
+     return serie
+ 
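+ # A quick illustration (hypothetical input) of the cleanup above:
+ #   remove_newlines(pd.Series(["About\nISB  DLabs"]))
+ #   # -> pd.Series(["About ISB DLabs"])
+ 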
+ def get_df():
+     # Create a list to store the text files
+     texts = []
+ 
+     for file in os.listdir(RESULTS_DIR + "text/"):
+         with open(RESULTS_DIR + "text/" + file, "r", encoding="UTF-8") as f:
+             text = f.read()
+ 
+             # Strip the '#update' marker from the filename and keep the raw text
+             texts.append((file.replace('#update', ''), text))
+ 
+     # Create a dataframe from the list of texts
+     df = pd.DataFrame(texts, columns=['fname', 'text'])
+ 
+     # Prefix each text with its filename and strip the newlines
+     df['text'] = df.fname + ". " + remove_newlines(df.text)
+     return df
+ 
+ SCRAPING_DONE = False
+ if not SCRAPING_DONE:
+     crawl(full_url)
+     df = get_df()
+     df.to_csv(RESULTS_DIR + 'processed/scraped.csv')
+     print(df.head())
+     # The notebook used `!zip`/`!unzip` cell magics; shell out instead so
+     # the script stays valid Python
+     os.system("zip -r iventure_scrape.zip scraped_files")
+ else:
+     os.system("unzip iventure_scrape.zip")
+ 
+ """# Create Embeddings
+ 
+ ## Clean
+ """
+ 
+ import tiktoken
+ from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
+ 
+ # Load the cl100k_base tokenizer, which is designed to work with the ada-002 model
+ tokenizer = tiktoken.get_encoding("cl100k_base")
+ 
+ df = pd.read_csv(RESULTS_DIR + 'processed/scraped.csv', index_col=0)
+ df.columns = ['title', 'text']
+ 
+ # Tokenize the text and save the number of tokens to a new column
+ df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
+ 
+ # Visualize the distribution of the number of tokens per row using a histogram
+ df.n_tokens.hist()
+ 
+ max_tokens = 500
+ 
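+ # For reference, tiktoken counts BPE tokens rather than words or characters;
+ # with the cl100k_base encoding loaded above:
+ #   len(tokenizer.encode("hello world"))  # -> 2
+ 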
+ # Function to split the text into chunks of at most max_tokens tokens
+ def split_into_many(text, max_tokens=max_tokens):
+ 
+     # Split the text into sentences
+     sentences = text.split('. ')
+ 
+     # Get the number of tokens for each sentence
+     n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
+ 
+     chunks = []
+     tokens_so_far = 0
+     chunk = []
+ 
+     # Loop through the sentences and token counts joined together in a tuple
+     for sentence, token in zip(sentences, n_tokens):
+ 
+         # If the number of tokens so far plus the number of tokens in the
+         # current sentence is greater than the max number of tokens, then
+         # add the chunk to the list of chunks and reset the chunk and
+         # tokens so far
+         if tokens_so_far + token > max_tokens:
+             chunks.append(". ".join(chunk) + ".")
+             chunk = []
+             tokens_so_far = 0
+ 
+         # If the number of tokens in the current sentence alone is greater
+         # than the max number of tokens, skip it and go to the next sentence
+         if token > max_tokens:
+             continue
+ 
+         # Otherwise, add the sentence to the chunk and its tokens to the total
+         chunk.append(sentence)
+         tokens_so_far += token + 1
+ 
+     # Add the last chunk to the list of chunks
+     if chunk:
+         chunks.append(". ".join(chunk) + ".")
+ 
+     return chunks
+ 
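+ # Usage sketch (long_page_text is a hypothetical string well over 500 tokens):
+ # every returned chunk ends on a sentence boundary and stays near or under the
+ # limit, though sentences individually longer than max_tokens are dropped:
+ #   chunks = split_into_many(long_page_text)
+ #   max(len(tokenizer.encode(c)) for c in chunks)  # <= max_tokens (roughly)
+ 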
+ def shorten(df):
+     shortened = []
+ 
+     # Loop through the dataframe
+     for row in df.iterrows():
+ 
+         # If the text is None, go to the next row
+         if row[1]['text'] is None:
+             continue
+ 
+         # If the number of tokens is greater than the max number of tokens,
+         # split the text into chunks
+         if row[1]['n_tokens'] > max_tokens:
+             shortened += split_into_many(row[1]['text'])
+ 
+         # Otherwise, add the text to the list of shortened texts
+         else:
+             shortened.append(row[1]['text'])
+ 
+     new_df = pd.DataFrame(shortened, columns=['text'])
+     new_df['n_tokens'] = new_df.text.apply(lambda x: len(tokenizer.encode(x)))
+     return new_df
+ 
+ df = shorten(df)
+ df.n_tokens.hist()
+ 
+ """## Create embeds"""
+ 
+ import openai
+ from dotenv import load_dotenv
+ load_dotenv()
+ 
+ SECRET_IN_ENV = False
+ 
+ def load_api_key():
+     # Strip the trailing newline so the key is passed through cleanly
+     with open("secret.txt", "r") as f:
+         return f.read().strip()
+ 
+ if SECRET_IN_ENV:
+     SECRET_TOKEN = os.getenv("SECRET_TOKEN")
+ else:
+     SECRET_TOKEN = load_api_key()
+ 
+ openai.api_key = SECRET_TOKEN
+ 
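+ # With SECRET_IN_ENV = True, load_dotenv() above would read the key from a
+ # `.env` file in the working directory; a hypothetical entry looks like:
+ #   SECRET_TOKEN=sk-...
+ 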
+ # Note that you may run into rate limit issues depending on how many files you try to embed
+ # Please check the rate limit guide to learn how to handle this: https://platform.openai.com/docs/guides/rate-limits
+ 
+ df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
+ df.to_csv(RESULTS_DIR + 'processed/embeddings.csv')
+ df.head()
+ 
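+ # The rate-limit note above suggests backing off and retrying on errors. A
+ # minimal sketch of that idea (not part of the original notebook): a
+ # hypothetical embed_with_retry helper that could replace the bare
+ # Embedding.create call in the apply above.
+ import time
+ 
+ def embed_with_retry(text, retries=5, wait=2.0):
+     # Retry the embedding call with simple exponential backoff
+     for attempt in range(retries):
+         try:
+             return openai.Embedding.create(input=text, engine='text-embedding-ada-002')['data'][0]['embedding']
+         except Exception as e:
+             print(f"Embedding attempt {attempt + 1} failed: {e}")
+             time.sleep(wait * (2 ** attempt))
+     raise RuntimeError("Embedding failed after all retries")
+ 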
+ """# QnA"""
+ 
+ from ast import literal_eval
+ 
+ df = pd.read_csv(RESULTS_DIR + 'processed/embeddings.csv', index_col=0)
+ # Embeddings round-trip through CSV as strings; parse them back into arrays
+ df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)
+ 
+ def create_context(question, df, max_len=1800, size="ada"):
+     """
+     Create a context for a question by finding the most similar texts in the dataframe
+     """
+ 
+     # Get the embeddings for the question
+     q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']
+ 
+     # Get the distances from the embeddings
+     df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
+ 
+     returns = []
+     cur_len = 0
+ 
+     # Sort by distance and add the text to the context until the context is too long
+     for i, row in df.sort_values('distances', ascending=True).iterrows():
+ 
+         # Add the length of the text to the current length
+         cur_len += row['n_tokens'] + 4
+ 
+         # If the context is too long, break
+         if cur_len > max_len:
+             break
+ 
+         # Else add it to the text that is being returned
+         returns.append(row["text"])
+ 
+     # Return the context
+     return "\n\n###\n\n".join(returns)
+ 
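+ # The returned context is just the closest chunks joined with "###"
+ # separators, ready to drop into the completion prompt below; schematically:
+ #   "closest chunk\n\n###\n\nnext-closest chunk\n\n###\n\n..."
+ 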
+ def answer_question(
+     df,
+     model="text-davinci-003",
+     question="Am I allowed to publish model outputs to Twitter, without a human review?",
+     max_len=1800,
+     size="ada",
+     debug=False,
+     max_tokens=150,
+     stop_sequence=None
+ ):
+     """
+     Answer a question based on the most similar context from the dataframe texts
+     """
+     context = create_context(
+         question,
+         df,
+         max_len=max_len,
+         size=size,
+     )
+     # If debug, print the raw context fed to the model
+     if debug:
+         print("Context:\n" + context)
+         print("\n\n")
+ 
+     try:
+         # Create a completion using the question and context
+         response = openai.Completion.create(
+             prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
+             temperature=0,
+             max_tokens=max_tokens,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0,
+             stop=stop_sequence,
+             model=model,
+         )
+         return response["choices"][0]["text"].strip()
+     except Exception as e:
+         print(e)
+         return ""
+ 
+ print(answer_question(df, question="What day is it?", debug=False))
+ 
+ print(answer_question(df, question="What is our newest embeddings model?"))