# -*- coding: utf-8 -*-
"""ISB chatbot.ipynb
Original file is located at
https://colab.research.google.com/drive/1GYmsZSR4MWuvORNpSWFWrXz79lQKb6oc
"""
"""# Scrape"""
# Regex to match a URL
# HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
# Define root domain to crawl
domain = "i-venture.org"
sitemap_url = "https://i-venture.org/sitemap.xml"
full_url = "https://i-venture.org/"
import os
RESULTS_DIR = "scraped_files/"
os.makedirs(RESULTS_DIR, exist_ok=True)
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import numpy as np
def get_sitemap(url=sitemap_url):
    """Return the list of page URLs (<loc> entries) found in the sitemap."""
    try:
        with urllib.request.urlopen(url) as response:
            xml = BeautifulSoup(response,
                                'lxml-xml',
                                from_encoding=response.info().get_param('charset'))
            urls = xml.find_all("url")
            locs = []
            for url in urls:
                loc = url.find("loc")
                if loc is not None:
                    locs.append(loc.text)
            return locs
    except Exception as e:
        print(e)
        return []
def crawl(url):
    # Parse the URL and get the domain
    # local_domain = urlparse(url).netloc
    queue = deque(get_sitemap())
    os.makedirs(RESULTS_DIR + "text/", exist_ok=True)
    os.makedirs(RESULTS_DIR + "processed", exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:
        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open(f'{RESULTS_DIR}text/' + url.strip("/").replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            text = soup.get_text()

            # If the page requires JavaScript, flag it (the raw text is still written)
            if "You need to enable JavaScript to run this app." in text:
                print("Unable to parse page " + url + " due to JavaScript being required")

            f.write(text)

    # # Get the hyperlinks from the URL and add them to the queue
    # for link in get_domain_hyperlinks(local_domain, url):
    #     if link not in seen:
    #         queue.append(link)
    #         seen.add(link)
def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie
def get_df():
    # Create a list to store the text files
    texts = []
    for file in os.listdir(RESULTS_DIR + "text/"):
        with open(RESULTS_DIR + "text/" + file, "r", encoding="UTF-8") as f:
            text = f.read()
            # Strip the '#update' marker from the file name and keep the raw page text
            texts.append((file.replace('#update', ''), text))

    # Create a dataframe from the list of texts
    df = pd.DataFrame(texts, columns=['fname', 'text'])

    # Set the text column to be the file name plus the raw text with the newlines removed
    df['text'] = df.fname + ". " + remove_newlines(df.text)
    return df
SCRAPING_DONE = False

if not SCRAPING_DONE:
    crawl(full_url)
    df = get_df()
    df.to_csv(RESULTS_DIR + 'processed/scraped.csv')
    df.head()
    !zip -r iventure_scrape.zip scraped_files
else:
    !unzip iventure_scrape.zip
"""# Create Embeddings
## Clean
"""
import tiktoken
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")
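# Illustrative check (a hypothetical sample string, not from the original notebook):
# the tokenizer maps text to integer token IDs, and len() of that list is exactly
# what the n_tokens column below counts per row.
sample_ids = tokenizer.encode("I-Venture at ISB")
print(len(sample_ids), sample_ids[:5])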
df = pd.read_csv(RESULTS_DIR + 'processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']
# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
# Visualize the distribution of the number of tokens per row using a histogram
df.n_tokens.hist()
max_tokens = 500
# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens=max_tokens):
    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):
        # If the number of tokens so far plus the number of tokens in the current sentence is
        # greater than the max number of tokens, then add the chunk to the list of chunks and
        # reset the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add the last chunk to the list of chunks
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks
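# Illustrative usage (a hypothetical example, not from the original notebook): a long
# text is split on ". " boundaries and regrouped so each chunk stays at or under the
# token budget passed as max_tokens.
example_chunks = split_into_many("This is a sentence about I-Venture. " * 200, max_tokens=100)
print(len(example_chunks), max(len(tokenizer.encode(c)) for c in example_chunks))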
def shorten(df):
    shortened = []

    # Loop through the dataframe
    for row in df.iterrows():
        # If the text is None, go to the next row
        if row[1]['text'] is None:
            continue

        # If the number of tokens is greater than the max number of tokens, split the text into chunks
        if row[1]['n_tokens'] > max_tokens:
            shortened += split_into_many(row[1]['text'])
        # Otherwise, add the text to the list of shortened texts
        else:
            shortened.append(row[1]['text'])

    new_df = pd.DataFrame(shortened, columns=['text'])
    new_df['n_tokens'] = new_df.text.apply(lambda x: len(tokenizer.encode(x)))
    return new_df
df = shorten(df)
df.n_tokens.hist()
"""## Create embeds"""
import openai
from dotenv import load_dotenv

load_dotenv()

# If SECRET_IN_ENV is True, the API key is read from the SECRET_TOKEN environment
# variable (loaded from .env above); otherwise it is read from a local secret.txt file.
SECRET_IN_ENV = False

def load_api_key():
    with open("secret.txt", "r") as f:
        return f.read().strip()

if SECRET_IN_ENV:
    SECRET_TOKEN = os.getenv("SECRET_TOKEN")
else:
    SECRET_TOKEN = load_api_key()

openai.api_key = SECRET_TOKEN
# Note that you may run into rate limit issues depending on how many files you try to embed
# Please check rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits
df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
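# Optional sketch (an assumption, not part of the original notebook): if the apply above
# hits the API rate limit, the same call can be wrapped in a simple retry with backoff.
# In the legacy SDK, 429 responses raise openai.error.RateLimitError.
import time

def embed_with_retry(text, retries=5, wait=10):
    for attempt in range(retries):
        try:
            return openai.Embedding.create(input=text, engine='text-embedding-ada-002')['data'][0]['embedding']
        except openai.error.RateLimitError:
            print(f"Rate limited, retrying in {wait}s (attempt {attempt + 1}/{retries})")
            time.sleep(wait)
    raise RuntimeError("Embedding request kept hitting the rate limit")

# e.g. df['embeddings'] = df.text.apply(embed_with_retry)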
df.to_csv(RESULTS_DIR + 'processed/embeddings.csv')
df.head()
"""# QnA"""
from ast import literal_eval
df = pd.read_csv(RESULTS_DIR + 'processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)
def create_context(question, df, max_len=1800, size="ada"):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """
    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)
def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None,
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )

    # If debug, print the raw context passed to the model
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""
print(answer_question(df, question="What day is it?", debug=False))
print(answer_question(df, question="What is our newest embeddings model?"))