CryptoScoutv1 committed on
Commit
0fc8f83
·
verified ·
1 Parent(s): 9bed6ed

Create WebScape_ADV.py

Browse files
Files changed (1) hide show
  1. WebScape_ADV.py +69 -0
WebScape_ADV.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import undetected_chromedriver as uc
3
+
4
+ from langchain.tools import tool
5
+ from bs4 import BeautifulSoup
6
+ from duckduckgo_search import DDGS
7
+
8
+
9
+ class WebScapeAdv_UC:
10
+ @tool("process search results with undetectable chrome", return_direct=False)
11
+ def scrape_with_undetectable_chrome(url: str) -> str:
12
+ """
13
+ Scrape webpage content using Selenium with undetectable Chrome driver.
14
+
15
+ :param url: The URL of the webpage to scrape.
16
+ :return: The text content of the webpage.
17
+ """
18
+ try:
19
+ options = uc.ChromeOptions()
20
+ options.add_argument('--headless')
21
+ options.add_argument('--no-sandbox')
22
+ options.add_argument('--disable-dev-shm-usage')
23
+
24
+ # Initialize undetectable Chrome driver
25
+ driver = uc.Chrome(options=options)
26
+ driver.get(url)
27
+ html = driver.page_source
28
+ driver.quit() # Ensure to quit the driver to free resources
29
+ soup = BeautifulSoup(html, 'html.parser')
30
+ return soup.get_text()
31
+ except Exception as e:
32
+ return f"Failed to fetch content with error: {e}"
33
+ from bs4 import BeautifulSoup
34
+ import requests
35
+ import undetected_chromedriver as uc
36
+
37
+
38
+
39
+ @tool("process search results with fallback", return_direct=False)
40
+ def scrape_with_fallback(url: str) -> str:
41
+ """
42
+ Attempts to scrape webpage content using BeautifulSoup first, then falls back to Selenium with undetectable Chrome driver if needed.
43
+
44
+ :param url: The URL of the webpage to scrape.
45
+ :return: The text content of the webpage.
46
+ """
47
+ # Try scraping with requests and BeautifulSoup
48
+ response = requests.get(url)
49
+ if response.status_code == 200:
50
+ soup = BeautifulSoup(response.content, 'html.parser')
51
+ if len(soup.get_text().strip()) > 100: # Arbitrary threshold of 100 characters
52
+ return soup.get_text()
53
+
54
+ # If the first attempt fails, fallback to Selenium with undetectable Chrome driver
55
+ try:
56
+ options = uc.ChromeOptions()
57
+ options.add_argument('--headless')
58
+ options.add_argument('--no-sandbox')
59
+ options.add_argument('--disable-dev-shm-usage')
60
+
61
+ # Initialize undetectable Chrome driver
62
+ driver = uc.Chrome(options=options)
63
+ driver.get(url)
64
+ html = driver.page_source
65
+ driver.quit() # Ensure to quit the driver to free resources
66
+ soup = BeautifulSoup(html, 'html.parser')
67
+ return soup.get_text()
68
+ except Exception as e:
69
+ return f"Failed to fetch content with error: {e}"