File size: 5,457 Bytes
951ed07
 
 
 
 
04f6f00
db62874
 
951ed07
 
 
 
 
 
 
 
 
 
 
 
04f6f00
 
 
 
 
 
 
951ed07
04f6f00
dda849d
1fe422c
 
 
 
04f6f00
1fe422c
 
 
 
 
 
 
 
 
04f6f00
 
 
951ed07
 
 
db62874
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04f6f00
951ed07
db62874
951ed07
5d2229e
 
18e7ac6
951ed07
 
 
db62874
951ed07
 
 
 
 
bcad864
951ed07
 
 
 
 
41b4429
951ed07
 
 
 
 
 
 
 
 
 
030a269
 
 
 
 
2de680f
030a269
951ed07
5d2229e
 
951ed07
 
 
 
 
 
 
5d2229e
db0da6b
951ed07
 
 
 
 
 
 
5d2229e
951ed07
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import openai
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Optional local-development convenience: if python-dotenv is importable, call
# load_dotenv() (presumably to pull settings from a local .env file into the
# process environment) before the API key is read below.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # In production, python-dotenv may not be installed

# The OpenAI key is taken from the OPEN_API_KEY environment variable; os.getenv
# returns None when it is unset, in which case no key is configured here.
# NOTE(review): the variable is spelled OPEN_API_KEY, not OPENAI_API_KEY --
# confirm this matches the secret name used by the deployment.
openai.api_key = os.getenv("OPEN_API_KEY")

class Conversation:
    """Chat session that answers questions about one scraped web page.

    ``get_data`` scrapes a page and seeds ``self.messages`` with system
    prompts describing it; ``ask_chatbot`` sends a user question together
    with that context to the OpenAI chat API. ``user`` and ``bot`` adapt
    those calls to a chat-history list of ``[user_message, bot_message]``
    pairs.
    """

    # Seconds to wait for the plain HTTP fetch before giving up, so a stalled
    # server cannot block the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 30
    # Joined paragraph text longer than MAX_PARAGRAPH_CHARS is replaced by its
    # first TRUNCATED_PARAGRAPH_CHARS characters to keep the prompt small.
    MAX_PARAGRAPH_CHARS = 7500
    TRUNCATED_PARAGRAPH_CHARS = 3000

    def __init__(self):
        """Start with an empty message history."""
        self.messages = []

    def to_valid_url(self, input_string):
        """Normalise user input into an absolute URL.

        Surrounding whitespace is stripped. Input without a ``://`` scheme
        separator (e.g. ``example.com``) gets ``https://`` prepended, provided
        it looks like a host name (contains a dot, or is ``localhost``) and
        holds no whitespace.

        Args:
            input_string: Raw URL text typed by the user.

        Returns:
            The normalised URL string.

        Raises:
            ValueError: If the input is not a string, is blank, or does not
                yield a URL with both a scheme and a network location.
        """
        print("url: ", input_string)
        error_message = "Invalid URL, please try again."

        if not isinstance(input_string, str):
            raise ValueError(error_message)
        url = input_string.strip()
        if not url:
            raise ValueError(error_message)

        # Decide up front whether a scheme must be added; checking only after
        # validation (as before) made the "add https://" step unreachable.
        scheme_added = "://" not in url
        if scheme_added:
            url = "https://" + url.lstrip("/")

        try:
            parsed_url = urllib.parse.urlparse(url)
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. bad IPv6 brackets).
            raise ValueError(error_message) from None

        if not (parsed_url.scheme and parsed_url.netloc):
            raise ValueError(error_message)

        if scheme_added:
            # Bare input is only accepted when it plausibly names a host, so
            # free text such as "hello world" is still rejected.
            host = parsed_url.hostname or ""
            looks_like_host = "." in host or host == "localhost"
            has_whitespace = any(ch.isspace() for ch in url)
            if has_whitespace or not looks_like_host:
                raise ValueError(error_message)

        return parsed_url.geturl()

    def _fetch_static_html(self, url):
        """Return the response body of a plain HTTP GET for ``url``."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        return response.text

    def _fetch_rendered_html(self, url):
        """Return the page source after loading ``url`` in headless Chrome."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        try:
            driver.get(url)
            return driver.page_source
        finally:
            # Always shut the browser down, even when loading the page fails,
            # so no Chrome process is left behind.
            driver.quit()

    def _scrape(self, url):
        """Fetch ``url``, falling back to Selenium when no body tag is present."""
        html_content = self._fetch_static_html(url)
        # str.find() returns -1 (truthy) when the text is missing, so a
        # membership test is used to detect a missing <body> tag.
        if not html_content or "<body" not in html_content.lower():
            print("Using Selenium for JavaScript rendering...")
            return self._fetch_rendered_html(url)
        return html_content

    def get_data(self, old_url):
        """Scrape a page and reset the conversation with its content.

        Collects the text of every ``h1``, ``h2`` and ``p`` element inside
        the page body and stores it as system messages.

        Args:
            old_url: URL text as entered by the user.

        Returns:
            The freshly built ``self.messages`` list.

        Raises:
            ValueError: If the URL is invalid or the page has no body.
        """
        url = self.to_valid_url(old_url)
        html = self._scrape(url)
        body = BeautifulSoup(html, 'html.parser').body
        if body is None:
            raise ValueError("Please try again")

        headings_1 = [e.text for e in body.find_all('h1')]
        headings_2 = [e.text for e in body.find_all('h2')]
        paragraphs = [e.text for e in body.find_all('p')]

        # Long pages are cut down to a short prefix of the joined text; short
        # pages keep the list of individual paragraphs.
        joined_paragraphs = ' '.join(paragraphs)
        if len(joined_paragraphs) > self.MAX_PARAGRAPH_CHARS:
            paragraphs = joined_paragraphs[:self.TRUNCATED_PARAGRAPH_CHARS]

        self.messages = [
            {'role': 'system', 'content': "You are a helpful assistant that must answer questions about a website."},
            {'role': 'system', 'content': f"here are the h1s - {headings_1}"},
            {'role': 'system', 'content': f"here are the h2s - {headings_2}"},
            {'role': 'system', 'content': f"here are the paragraphs - {paragraphs}"},
        ]
        return self.messages

    def ask_chatbot(self, input):
        """Send a user question to the chat model and record the exchange.

        The question and the reply are added to ``self.messages`` only after
        a successful call, so a failed request does not leave an unanswered
        user turn in the history.

        Args:
            input: The user's question. Falsy input is ignored.

        Returns:
            The assistant's reply text, or ``None`` when ``input`` is falsy.

        Raises:
            ValueError: If the API rejects the request as invalid or the
                reply is empty.
        """
        if not input:
            return None

        user_message = {"role": "user", "content": input}
        try:
            chat = openai.ChatCompletion.create(
                model="gpt-3.5-turbo", messages=self.messages + [user_message]
            )
        except openai.error.InvalidRequestError as err:
            raise ValueError(
                "The website is too large to understand. Please try a different site."
            ) from err

        reply = chat.choices[0].message.content
        if not reply:
            raise ValueError("Please try again")

        self.messages.append(user_message)
        self.messages.append({"role": "assistant", "content": reply})
        return reply

    def user(self, user_message, history):
        """Queue a user message in the chat history.

        Args:
            user_message: Text the user submitted.
            history: List of ``[user_message, bot_message]`` pairs.

        Returns:
            A tuple of an empty string and a new history list with
            ``[user_message, None]`` appended.
        """
        return "", history + [[user_message, None]]

    def bot(self, history):
        """Fill in the bot reply for the most recent history entry.

        Args:
            history: List of ``[user_message, bot_message]`` pairs; the last
                pair is updated in place.

        Returns:
            The same ``history`` list. An empty history is returned as is.
        """
        if not history:
            return history

        user_message = history[-1][0]
        try:
            bot_message = self.ask_chatbot(user_message)
        except ValueError:
            bot_message = "Please try again"
        history[-1][1] = bot_message
        return history