File size: 2,177 Bytes
4ad7a82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from bs4 import BeautifulSoup
from crewai_tools import ScrapeWebsiteTool
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async
from pyppeteer import launch
import asyncio
import base64
import requests


# Module-level ScrapeWebsiteTool (from crewai_tools) instance, built once at
# import time. Nothing in this file references it, so it is presumably
# imported by other modules -- NOTE(review): confirm against callers.
scrape_tool = ScrapeWebsiteTool()


def CustomScrapeWebsiteTool(url, timeout=30):
    """Fetch ``url`` with browser-like headers and return its text content.

    The response body is parsed with BeautifulSoup's ``html.parser`` and
    reduced to the text returned by ``get_text()``; blank lines are removed
    and runs of spaces are collapsed to a single space.

    Args:
        url: Address of the page to download.
        timeout: Value handed to ``requests.get(timeout=...)``. Defaults to
            30 so that an unresponsive server cannot block the caller
            indefinitely.

    Returns:
        The whitespace-normalised text of the page as a ``str``.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...).
        The HTTP status code is not checked, so the body of an error page is
        returned like any other page.
    """
    # NOTE(review): 'br' is advertised in Accept-Encoding; whether the body is
    # transparently decoded depends on a brotli decoder being installed in the
    # runtime environment -- confirm before relying on it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://www.google.com/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept-Encoding': 'gzip, deflate, br'
    }
    # Without an explicit timeout requests waits forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    parsed = BeautifulSoup(response.content, "html.parser")
    text = parsed.get_text()

    # Pass 1: drop lines that are empty or whitespace-only.
    text = '\n'.join(line for line in text.split('\n') if line.strip())
    # Pass 2: collapse runs of spaces. Tokens that are whitespace-only are
    # dropped, which also removes a newline that sits alone between spaces.
    text = ' '.join(token for token in text.split(' ') if token.strip())

    return text


async def AsyncWebpageScreenshot(url):
    """Capture a full-page screenshot of ``url`` with headless Chromium.

    Uses Playwright's async API: launches Chromium headless, opens a page,
    applies ``stealth_async`` to it, navigates to ``url`` and takes a
    screenshot with ``full_page=True``.

    Args:
        url: Address of the page to capture.

    Returns:
        The screenshot bytes encoded as a base64 ``str`` (UTF-8 decoded).

    Raises:
        Any exception raised by Playwright while launching, navigating or
        capturing; the browser is closed before the exception propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # stealth_async is a coroutine function: calling it without
            # `await` only creates a coroutine object that never runs, so
            # the stealth patches would silently not be applied.
            await stealth_async(page)
            await page.goto(url)

            screenshot_bytes = await page.screenshot(full_page=True)
        finally:
            # Close the browser even when navigation or capture fails.
            await browser.close()

    base64_image = base64.b64encode(screenshot_bytes).decode("utf-8")
    return base64_image


def WebpageScreenshot(url):
    """Synchronous entry point for ``AsyncWebpageScreenshot``.

    Prints a progress line, then drives the coroutine to completion with
    ``asyncio.run`` and hands back whatever it returns.

    Args:
        url: Address of the page to capture.

    Returns:
        The value produced by ``AsyncWebpageScreenshot(url)``.
    """
    print("Taking screenshot: ", url)
    return asyncio.run(AsyncWebpageScreenshot(url))


async def AsyncPyppeteerWebpageScreenshot(url):
    """Capture a screenshot of ``url`` using pyppeteer.

    Launches a browser via ``launch()``, opens a new page, navigates to
    ``url`` and calls ``page.screenshot()`` with no options.

    Args:
        url: Address of the page to capture.

    Returns:
        The screenshot bytes encoded as a base64 ``str`` (UTF-8 decoded).

    Raises:
        Any exception raised by pyppeteer while opening the page, navigating
        or capturing; the browser is closed before the exception propagates.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)
        screenshot_bytes = await page.screenshot()
    finally:
        # Always shut the browser down; otherwise a failed navigation or
        # capture would leave the launched browser running.
        await browser.close()

    base64_image = base64.b64encode(screenshot_bytes).decode("utf-8")
    return base64_image

def PyppeteerWebpageScreenshot(url):
    """Synchronous entry point for ``AsyncPyppeteerWebpageScreenshot``.

    Prints a progress line, then drives the coroutine to completion with
    ``asyncio.run`` and hands back whatever it returns.

    Args:
        url: Address of the page to capture.

    Returns:
        The value produced by ``AsyncPyppeteerWebpageScreenshot(url)``.
    """
    print("Taking screenshot: ", url)
    return asyncio.run(AsyncPyppeteerWebpageScreenshot(url))