# Spaces: Running
# File size: 5,457 Bytes
import os
import openai
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# Load variables from a local .env file when python-dotenv is available.
try:
    from dotenv import load_dotenv
except ImportError:
    # In production, python-dotenv may not be installed; the environment is
    # then expected to provide the variables directly.
    pass
else:
    load_dotenv()

# OPEN_API_KEY is the variable name this project has always read, so it keeps
# priority. Fall back to the conventional OPENAI_API_KEY so that a missing
# OPEN_API_KEY does not overwrite the key with None when the conventional
# variable is set.
openai.api_key = os.getenv("OPEN_API_KEY") or os.getenv("OPENAI_API_KEY")
class Conversation:
    """Chat session that answers questions about a single scraped web page.

    ``get_data`` scrapes a page and seeds ``self.messages`` with system
    prompts describing it; ``ask_chatbot`` sends the running message list to
    the OpenAI chat API; ``user`` and ``bot`` adapt those calls to a chat
    history made of ``[user_message, bot_message]`` pairs.
    """

    _INVALID_URL_MESSAGE = "Invalid URL, please try again."
    # Only web URLs can be fetched by the scraper.
    _ALLOWED_SCHEMES = ("http", "https")
    # Seconds to wait for the plain HTTP fetch before giving up.
    _REQUEST_TIMEOUT_SECONDS = 15
    # NOTE(review): text longer than the first value is cut down to the second
    # (smaller) value. The mismatch is kept exactly as in the original code;
    # confirm whether it is intentional.
    _PARAGRAPH_CHAR_THRESHOLD = 7500
    _PARAGRAPH_CHAR_KEEP = 3000

    def __init__(self):
        # Message dicts ({'role': ..., 'content': ...}) sent to the chat API.
        self.messages = []

    def to_valid_url(self, input_string):
        """Normalize user input into an absolute http(s) URL.

        Surrounding whitespace is stripped, and ``https://`` is prepended
        when the input carries no ``scheme://`` prefix (e.g. ``example.com``).

        Args:
            input_string: Raw URL text entered by the user.

        Returns:
            The normalized URL string.

        Raises:
            ValueError: If the input is empty, not a string, has no host,
                has a malformed port, or uses a scheme other than http/https.
        """
        print("url: ", input_string)
        if not isinstance(input_string, str):
            raise ValueError(self._INVALID_URL_MESSAGE)
        url = input_string.strip()
        if not url:
            raise ValueError(self._INVALID_URL_MESSAGE)
        try:
            parsed_url = urllib.parse.urlparse(url)
            # "example.com" or "localhost:8080" has no "scheme://" prefix;
            # default those to https instead of rejecting them.
            if not url.lower().startswith(parsed_url.scheme + "://"):
                parsed_url = urllib.parse.urlparse("https://" + url)
            # Reading .port raises ValueError when the port is malformed.
            _ = parsed_url.port
        except ValueError:
            raise ValueError(self._INVALID_URL_MESSAGE) from None
        if parsed_url.scheme not in self._ALLOWED_SCHEMES or not parsed_url.hostname:
            raise ValueError(self._INVALID_URL_MESSAGE)
        return parsed_url.geturl()

    def _fetch_static_html(self, url):
        """Return the raw response text of a plain HTTP GET for *url*."""
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT_SECONDS)
        return response.text

    @staticmethod
    def _fetch_rendered_html(url):
        """Return the page source after loading *url* in headless Chrome."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # NOTE(review): the driver path is passed positionally as in the
        # original; newer Selenium releases may expect a Service object.
        # Confirm against the installed Selenium version.
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        try:
            driver.get(url)
            return driver.page_source
        finally:
            # Always shut the browser down, even when loading the page fails.
            driver.quit()

    @staticmethod
    def _lacks_body_tag(html_content):
        """Return True when *html_content* is empty or has no ``<body`` tag."""
        return not html_content or "<body" not in html_content.lower()

    def _smart_scrape(self, url):
        """Fetch *url* over HTTP, falling back to Selenium when needed.

        The fallback is used when the plain response is empty or contains no
        ``<body`` tag.
        """
        html_content = self._fetch_static_html(url)
        if self._lacks_body_tag(html_content):
            print("Using Selenium for JavaScript rendering...")
            return self._fetch_rendered_html(url)
        return html_content

    def get_data(self, old_url):
        """Scrape a page and reset the conversation with prompts describing it.

        Args:
            old_url: URL text as entered by the user.

        Returns:
            The new ``self.messages`` list: one instruction message followed
            by system messages listing the page's h1s, h2s and paragraphs.

        Raises:
            ValueError: If the URL is invalid or the page has no ``<body>``.
        """
        url = self.to_valid_url(old_url)
        html = self._smart_scrape(url)
        body = BeautifulSoup(html, 'html.parser').body
        if body is None:
            raise ValueError("Please try again")

        headings_1 = [e.text for e in body.find_all('h1')]
        headings_2 = [e.text for e in body.find_all('h2')]
        paragraphs = [e.text for e in body.find_all('p')]

        # Long pages: replace the paragraph list with a truncated prefix of
        # the joined text (a str instead of a list) to keep the prompt small.
        joined_paragraphs = ' '.join(paragraphs)
        if len(joined_paragraphs) > self._PARAGRAPH_CHAR_THRESHOLD:
            paragraphs = joined_paragraphs[:self._PARAGRAPH_CHAR_KEEP]

        # Assigned only after scraping succeeds, so a failure above leaves
        # the previous conversation untouched.
        self.messages = [
            {'role': 'system', 'content': "You are a helpful assistant that must answer questions about a website."},
            {'role': 'system', 'content': f"here are the h1s - {headings_1}"},
            {'role': 'system', 'content': f"here are the h2s - {headings_2}"},
            {'role': 'system', 'content': f"here are the paragraphs - {paragraphs}"},
        ]
        return self.messages

    def ask_chatbot(self, input):
        """Send the conversation to the chat model and record its reply.

        Args:
            input: The user's message. When falsy, nothing is appended and
                the existing messages are sent as they are.

        Returns:
            The assistant's reply text.

        Raises:
            ValueError: If the API rejects the request as invalid (reported
                to the user as the website being too large) or the reply is
                empty.
        """
        if input:
            self.messages.append({"role": "user", "content": input})
        try:
            chat = openai.ChatCompletion.create(
                model="gpt-3.5-turbo", messages=self.messages
            )
        except openai.error.InvalidRequestError as err:
            raise ValueError(
                "The website is too large to understand. Please try a different site."
            ) from err
        reply = chat.choices[0].message.content
        if not reply:
            raise ValueError("Please try again")
        self.messages.append({"role": "assistant", "content": reply})
        return reply

    def user(self, user_message, history):
        """Append the user's message to the chat history.

        Returns:
            A tuple of an empty string and a new history list whose last
            entry is ``[user_message, None]`` (the reply is filled in by
            ``bot``).
        """
        return "", history + [[user_message, None]]

    def bot(self, history):
        """Fill in the reply for the most recent entry of *history*.

        The last entry's user message is sent through ``ask_chatbot``; any
        ``ValueError`` it raises is shown to the user as "Please try again".

        Returns:
            The same history list, with the last entry's reply set.
        """
        user_message = history[-1][0]
        try:
            bot_message = self.ask_chatbot(user_message)
        except ValueError:
            bot_message = "Please try again"
        history[-1][1] = bot_message
        return history