# Web file-viewer residue: Spaces status "Runtime error"; file size 2,177 bytes; revision 4ad7a82.
from bs4 import BeautifulSoup
from crewai_tools import ScrapeWebsiteTool
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async
from pyppeteer import launch
import asyncio
import base64
import requests
# Shared crewai_tools ScrapeWebsiteTool instance, constructed once at import
# time. None of the functions visible in this file reference it.
scrape_tool = ScrapeWebsiteTool()
def CustomScrapeWebsiteTool(url, timeout=30):
    """Fetch *url* with browser-like headers and return its compacted text.

    The page is downloaded with ``requests``, parsed with BeautifulSoup's
    ``html.parser``, and the extracted text is compacted: whitespace-only
    lines are dropped and runs of spaces are collapsed to a single space.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The text content of the page as one string.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # Header set imitating a desktop Chrome browser.
    # Accept-Encoding is intentionally not set here: requests fills it in
    # with the encodings it can actually decode. Advertising "br" by hand
    # yields undecoded Brotli bytes when no Brotli package is installed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://www.google.com/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    parsed = BeautifulSoup(response.content, "html.parser")
    text = parsed.get_text()
    # Drop lines that are empty or whitespace-only.
    text = '\n'.join(line for line in text.split('\n') if line.strip())
    # Collapse runs of spaces by discarding the empty pieces between them.
    text = ' '.join(piece for piece in text.split(' ') if piece.strip())
    return text
async def AsyncWebpageScreenshot(url):
    """Capture a full-page screenshot of *url* with headless Chromium.

    Uses Playwright's async API with the playwright_stealth patches applied
    to the page before navigation.

    Args:
        url: Address of the page to open.

    Returns:
        The screenshot bytes encoded as a base64 UTF-8 string.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # stealth_async is a coroutine function: calling it without
            # ``await`` only creates a coroutine object and applies nothing.
            await stealth_async(page)
            await page.goto(url)
            screenshot_bytes = await page.screenshot(full_page=True)
        finally:
            # Close the browser even when navigation or capture fails.
            await browser.close()
    base64_image = base64.b64encode(screenshot_bytes).decode("utf-8")
    return base64_image
def WebpageScreenshot(url):
    """Synchronous entry point for AsyncWebpageScreenshot.

    Prints a progress line, then runs the coroutine to completion with
    ``asyncio.run`` and returns whatever it produced.
    """
    print("Taking screenshot: ", url)
    return asyncio.run(AsyncWebpageScreenshot(url))
async def AsyncPyppeteerWebpageScreenshot(url):
    """Capture a screenshot of *url* using a pyppeteer-launched browser.

    Args:
        url: Address of the page to open.

    Returns:
        The screenshot bytes encoded as a base64 UTF-8 string.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)
        screenshot_bytes = await page.screenshot()
    finally:
        # Close the browser even when navigation or capture fails, so the
        # browser process is not left running.
        await browser.close()
    base64_image = base64.b64encode(screenshot_bytes).decode("utf-8")
    return base64_image
def PyppeteerWebpageScreenshot(url):
    """Synchronous entry point for AsyncPyppeteerWebpageScreenshot.

    Prints a progress line, then runs the coroutine to completion with
    ``asyncio.run`` and returns whatever it produced.
    """
    print("Taking screenshot: ", url)
    return asyncio.run(AsyncPyppeteerWebpageScreenshot(url))
|