File size: 6,748 Bytes
ed88065
 
 
 
 
 
 
 
 
 
 
 
 
 
8537d45
ed88065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8537d45
 
ed88065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8537d45
 
 
ed88065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Standard library
import asyncio
import base64
import json
import os
import time
from multiprocessing import Pool

# Third-party
import streamlit as st
from openai import OpenAI
from pyppeteer import launch


#################### OpenAI ####################
# Read the OpenAI key from the environment; when unset, fall back to asking
# the user for it via a text input in the Streamlit sidebar.
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    with st.sidebar:
        api_key = st.text_input("Please enter your OpenAI API key")


#################### Functions ####################

# Define the function to be run in a separate process since it is async
def run_pyppeteer(url):
    """Synchronously capture screenshots of *url*.

    Runs the async capture_screenshots() coroutine to completion and returns
    its dict of base64-encoded screenshots. Intended to execute inside a
    multiprocessing worker, where no event loop exists yet.
    """
    # asyncio.run creates, runs, and reliably closes a fresh event loop.
    # The original new_event_loop/run_until_complete/close sequence leaked
    # the loop whenever capture_screenshots raised (close was skipped).
    return asyncio.run(capture_screenshots(url))

# Get the url from prompt using GPT3.5
def get_url_from_prompt(prompt):
    """Return a URL (str) that GPT-3.5 suggests for answering *prompt*.

    The model is forced into JSON-object mode, so the reply is parsed with
    json.loads and the "url" field extracted. Raises KeyError if the reply
    lacks a "url" key.
    """
    # Pass the timeout through the constructor instead of mutating the
    # client attribute after construction.
    model = OpenAI(api_key=api_key, timeout=30)
    messages = [
        {
            "role": "system",
            # NOTE: fixed the "fromat" typo in the original instruction.
            "content": "You are a web crawler. Your job is to give the user a URL to go to in order to find the answer to the question. Go to a direct URL that will likely have the answer to the user's question. Respond in the following JSON format: {\"url\": \"<put url here>\"}",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]

    response = model.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        max_tokens=1024,
        response_format={"type": "json_object"},
        seed=2232,
    )

    # The original also appended the assistant reply to the local `messages`
    # list, which was discarded on return — dead code, removed.
    message_json = json.loads(response.choices[0].message.content)
    return message_json["url"]

# Capture screenshots from url using pyppeteer
async def capture_screenshots(url):
    """Return {"screenshot_1": <base64 PNG>, ...} covering the whole page.

    Navigates a headless browser to *url*, then scrolls one viewport height
    at a time, taking a screenshot before each scroll, until the bottom of
    the page is reached.
    """
    browser = await launch()
    try:
        page = await browser.newPage()

        await page.setViewport({'width': 1920, 'height': 1080})
        await page.goto(url)
        screenshots_dict = {}
        screenshot_counter = 1

        while True:
            screenshot = await page.screenshot()
            screenshot_base64 = base64.b64encode(screenshot).decode('utf-8')
            screenshots_dict[f"screenshot_{screenshot_counter}"] = screenshot_base64

            # Scroll one viewport down, then check whether the bottom of the
            # document has been reached.
            await page.evaluate("window.scrollBy(0, window.innerHeight)")
            scroll_top = await page.evaluate("window.scrollY")
            scroll_height = await page.evaluate("document.body.scrollHeight")
            inner_height = await page.evaluate("window.innerHeight")

            if scroll_top + inner_height >= scroll_height:
                break

            screenshot_counter += 1
    finally:
        # Always close the browser — the original leaked the Chromium
        # process whenever goto()/screenshot() raised.
        await browser.close()

    return screenshots_dict
  
def scrape_images_using_gtpVision(prompt, screenshot):
    """Ask GPT-4 Vision to answer *prompt* from one base64-encoded PNG.

    Returns the model's reply (str). The system prompt instructs the model
    to reply with the literal marker `ANSWER_NOT_FOUND` when the screenshot
    does not contain the answer; callers branch on that marker.
    """
    # Timeout goes in the constructor rather than being patched on after.
    model = OpenAI(api_key=api_key, timeout=30)

    # The original built a local `messages` list and then sliced it with
    # [1:], which on a one-element list is always empty — dead code, removed.
    response = model.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                # Typos fixed ("dont", "Thats extreame important"); meaning
                # of the instruction is unchanged.
                "content": (
                    "Your job is to answer the user's question based on the "
                    "given screenshot only with more than 95% certainty. "
                    "Answer the user as an assistant, but don't tell that "
                    "the information is from a screenshot or an image. "
                    "Pretend it is information that you know. If you can't "
                    "answer the question, don't look for another answer, "
                    "simply respond with the code `ANSWER_NOT_FOUND` and "
                    "nothing else. It is extremely important that you don't "
                    "try to find the answer from another source."
                ),
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # The chat completions API expects an object with a
                        # "url" key here, not a bare string; the data URI
                        # embeds the screenshot inline.
                        "image_url": {"url": f"data:image/png;base64,{screenshot}"},
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            },
        ],
        max_tokens=1024,
    )

    return response.choices[0].message.content

#################### Streamlit app ####################
def app():
    """Streamlit UI: question -> URL (GPT-3.5) -> screenshots (pyppeteer)
    -> answer extraction (GPT-4 Vision, one screenshot at a time)."""
    # Emoji below were mojibake in the original (UTF-8 bytes misdecoded);
    # restored to the intended characters.
    st.title("💬 Virtu.web")
    st.caption("🚀 A web scraper using AI (GPT4Vision + Pyppeteer) by Virtu.ai")

    prompt = st.text_input("Your question?", key="search_q")
    if prompt:

        # Check if API key is provided
        if not api_key:
            st.error("Please enter your OpenAI API key or set it as an environment variable")
            st.code("Add the line below to ~/.bashrc or ~/.zshrc" + "\n" + "export OPENAI_API_KEY=your_openai_key" + "\n" + "source ~/.bashrc or source ~/.zshrc")
            st.stop()

        # Defined up front so the final st.write cannot raise NameError when
        # no screenshots come back (the original crashed in that case).
        gptVision_return_message = None

        with st.status("🤖 I'm looking for the answer... Follow the process in the meantime", expanded=True) as status:

            start_time = time.time()

            st.write("Searching for a url using gpt3.5...")
            url = get_url_from_prompt(prompt)
            st.write("Found URL " + url)

            st.write("Retrieving screenshots from url using pyppeteer...")
            # Pyppeteer needs its own event loop, so it runs in a worker
            # process. The context manager tears the pool down (the original
            # never called close()/join()), and .get() blocks until the
            # worker finishes — no busy-wait sleep loop needed.
            with Pool(processes=1) as pool:
                screenshots_dict = pool.apply_async(run_pyppeteer, (url,)).get()
            st.write("Screenshots retrieved")

            st.write("Processing images using GPT4Vision...")
            for i, screenshot in enumerate(screenshots_dict.values(), start=1):

                gptVision_return_message = scrape_images_using_gtpVision(prompt, screenshot)
                st.write(f"Scraping image using GPT4Vision... Image {i} of {len(screenshots_dict)} processed")

                binary_image = base64.b64decode(screenshot)

                if "ANSWER_NOT_FOUND" in gptVision_return_message:
                    st.write("Answer not found on current screenshot below, I'll keep looking in the same website")
                    st.image(binary_image)
                    continue

                # Answer located — show it, store it in session state, and
                # stop scanning further screenshots.
                st.write("Answer found on screenshot below")
                st.image(binary_image)

                st.session_state["messages"] = [{"role": "assistant", "content": gptVision_return_message}]

                status.update(label="Search took - {:.2f} seconds".format(time.time() - start_time), state="complete", expanded=False)

                break

        if gptVision_return_message is not None:
            st.write(gptVision_return_message)

app()