# -*- coding: utf-8 -*-
"""ISB chatbot.ipynb

Original file is located at
    https://colab.research.google.com/drive/1GYmsZSR4MWuvORNpSWFWrXz79lQKb6oc
"""

"""# Scrape"""

# Regex to match a URL
# HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'

# Define the root domain to crawl
domain = "i-venture.org"
sitemap_url = "https://i-venture.org/sitemap.xml"
full_url = "https://i-venture.org/"

import os

RESULTS_DIR = "scraped_files/"
os.makedirs(RESULTS_DIR, exist_ok=True)

import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import pandas as pd
import numpy as np

def get_sitemap(url=sitemap_url):
    """Return the list of page URLs (<loc> entries) found in the sitemap."""
    try:
        with urllib.request.urlopen(url) as response:
            xml = BeautifulSoup(response, 'lxml-xml',
                                from_encoding=response.info().get_param('charset'))
        locs = []
        for url_tag in xml.find_all("url"):
            loc = url_tag.find("loc")
            if loc:
                locs.append(loc.text)
        return locs
    except Exception as e:
        print(e)
        return []

def crawl(url):
    # Parse the URL and get the domain
    # local_domain = urlparse(url).netloc

    # Seed the queue with every URL listed in the sitemap
    queue = deque(get_sitemap())

    os.makedirs(RESULTS_DIR + "text/", exist_ok=True)
    os.makedirs(RESULTS_DIR + "processed", exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:
        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a .txt file
        with open(f'{RESULTS_DIR}text/' + url.strip("/").replace("/", "_") + ".txt",
                  "w", encoding="UTF-8") as f:
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            text = soup.get_text()

            # Flag pages that require JavaScript; their extracted text is mostly empty
            if "You need to enable JavaScript to run this app." in text:
                print("Unable to parse page " + url + " due to JavaScript being required")

            f.write(text)

        # # Get the hyperlinks from the URL and add them to the queue
        # for link in get_domain_hyperlinks(local_domain, url):
        #     if link not in seen:
        #         queue.append(link)
        #         seen.add(link)

def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie

def get_df():
    # Create a list to store the text files
    texts = []

    for file in os.listdir(RESULTS_DIR + "text/"):
        with open(RESULTS_DIR + "text/" + file, "r", encoding="UTF-8") as f:
            text = f.read()
            # Strip the '#update' marker from the file name and keep the raw text
            texts.append((file.replace('#update', ''), text))

    # Create a dataframe from the list of texts
    df = pd.DataFrame(texts, columns=['fname', 'text'])

    # Prefix each row with its file name and set the text column to the raw text
    # with the newlines removed
    df['text'] = df.fname + ". " + remove_newlines(df.text)
    return df

SCRAPING_DONE = False

if not SCRAPING_DONE:
    crawl(full_url)
    df = get_df()
    df.to_csv(RESULTS_DIR + 'processed/scraped.csv')
    df.head()
    !zip -r iventure_scrape.zip scraped_files
else:
    !unzip iventure_scrape.zip
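"""The crawl above calls `requests.get` with no timeout or status check, so a slow or
broken page can hang the loop or silently save an error page. A more defensive fetch
helper might look like the sketch below; `fetch_text` and its `delay`/`timeout`
defaults are our own additions, not part of the original crawl."""

import time

def fetch_text(url, delay=1.0, timeout=10):
    """Fetch a page and return its visible text, or None on failure."""
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()  # surface 4xx/5xx responses instead of saving an error page
    except requests.RequestException as e:
        print(f"Skipping {url}: {e}")
        return None
    time.sleep(delay)  # small pause to be polite to the server between requests
    return BeautifulSoup(resp.text, "html.parser").get_text()

# Inside crawl(), `text = fetch_text(url)` (skipping None results) could replace
# the bare requests.get call.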
" + remove_newlines(df.text) return df SCRAPING_DONE = False if not SCRAPING_DONE: crawl(full_url) df = get_df() df.to_csv(RESULTS_DIR + 'processed/scraped.csv') df.head() !zip -r iventure_scrape.zip scraped_files else: !unzip iventure_scrape.zip """# Create Embeddings ## Clean """ import tiktoken from openai.embeddings_utils import distances_from_embeddings, cosine_similarity # Load the cl100k_base tokenizer which is designed to work with the ada-002 model tokenizer = tiktoken.get_encoding("cl100k_base") df = pd.read_csv(RESULTS_DIR + 'processed/scraped.csv', index_col=0) df.columns = ['title', 'text'] # Tokenize the text and save the number of tokens to a new column df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x))) # Visualize the distribution of the number of tokens per row using a histogram df.n_tokens.hist() max_tokens = 500 # Function to split the text into chunks of a maximum number of tokens def split_into_many(text, max_tokens = max_tokens): # Split the text into sentences sentences = text.split('. ') # Get the number of tokens for each sentence n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences] chunks = [] tokens_so_far = 0 chunk = [] # Loop through the sentences and tokens joined together in a tuple for sentence, token in zip(sentences, n_tokens): # If the number of tokens so far plus the number of tokens in the current sentence is greater # than the max number of tokens, then add the chunk to the list of chunks and reset # the chunk and tokens so far if tokens_so_far + token > max_tokens: chunks.append(". ".join(chunk) + ".") chunk = [] tokens_so_far = 0 # If the number of tokens in the current sentence is greater than the max number of # tokens, go to the next sentence if token > max_tokens: continue # Otherwise, add the sentence to the chunk and add the number of tokens to the total chunk.append(sentence) tokens_so_far += token + 1 # Add the last chunk to the list of chunks if chunk: chunks.append(". 
".join(chunk) + ".") return chunks def shorten(df): shortened = [] # Loop through the dataframe for row in df.iterrows(): # If the text is None, go to the next row if row[1]['text'] is None: continue # If the number of tokens is greater than the max number of tokens, split the text into chunks if row[1]['n_tokens'] > max_tokens: shortened += split_into_many(row[1]['text']) # Otherwise, add the text to the list of shortened texts else: shortened.append( row[1]['text'] ) new_df = pd.DataFrame(shortened, columns = ['text']) new_df['n_tokens'] = new_df.text.apply(lambda x: len(tokenizer.encode(x))) return new_df df = shorten(df) df.n_tokens.hist() """## Create embeds""" import openai from dotenv import load_dotenv load_dotenv() SECRET_IN_ENV = False import os SECRET_TOKEN = os.getenv("SECRET_TOKEN") def load_api_key(): with open("secret.txt", "r") as f: return f.read() if SECRET_IN_ENV: SECRET_TOKEN = os.getenv("SECRET_TOKEN") else: SECRET_TOKEN = load_api_key() openai.api_key = SECRET_TOKEN # Note that you may run into rate limit issues depending on how many files you try to embed # Please check rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']) df.to_csv('processed/embeddings.csv') df.head() """# QnA""" from ast import literal_eval df = pd.read_csv('processed/embeddings.csv', index_col=0) df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array) def create_context( question, df, max_len=1800, size="ada" ): """ Create a context for a question by finding the most similar context from the dataframe """ # Get the embeddings for the question q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding'] # Get the distances from the embeddings df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine') returns = [] cur_len = 0 # Sort by distance and add the text to the context until the context is too long for i, row in df.sort_values('distances', ascending=True).iterrows(): # Add the length of the text to the current length cur_len += row['n_tokens'] + 4 # If the context is too long, break if cur_len > max_len: break # Else add it to the text that is being returned returns.append(row["text"]) # Return the context return "\n\n###\n\n".join(returns) def answer_question( df, model="text-davinci-003", question="Am I allowed to publish model outputs to Twitter, without a human review?", max_len=1800, size="ada", debug=False, max_tokens=150, stop_sequence=None ): """ Answer a question based on the most similar context from the dataframe texts """ context = create_context( question, df, max_len=max_len, size=size, ) # If debug, print the raw model response if debug: print("Context:\n" + context) print("\n\n") try: # Create a completions using the questin and context response = openai.Completion.create( prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:", temperature=0, max_tokens=max_tokens, top_p=1, frequency_penalty=0, presence_penalty=0, stop=stop_sequence, model=model, ) return response["choices"][0]["text"].strip() except Exception as e: print(e) return "" print(answer_question(df, question="What day is it?", debug=False)) print(answer_question(df, 
question="What is our newest embeddings model?"))