Spaces:

CryptoScoutv1
/

CryptoScout_TradeAdvisor

Build error

App Files Files Community

CryptoScoutv1 committed on Feb 18, 2024

Commit

0fc8f83

verified ·

1 Parent(s): 9bed6ed

Create WebScape_ADV.py

Browse files

Files changed (1) hide show

WebScape_ADV.py +69 -0

WebScape_ADV.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import requests
+import undetected_chromedriver as uc
+from langchain.tools import tool
+from bs4 import BeautifulSoup
+from duckduckgo_search import DDGS
+class WebScapeAdv_UC:
+    @tool("process search results with undetectable chrome", return_direct=False)
+    def scrape_with_undetectable_chrome(url: str) -> str:
+        """
+        Scrape webpage content using Selenium with undetectable Chrome driver.
+        :param url: The URL of the webpage to scrape.
+        :return: The text content of the webpage.
+        """
+        try:
+            options = uc.ChromeOptions()
+            options.add_argument('--headless')
+            options.add_argument('--no-sandbox')
+            options.add_argument('--disable-dev-shm-usage')
+            # Initialize undetectable Chrome driver
+            driver = uc.Chrome(options=options)
+            driver.get(url)
+            html = driver.page_source
+            driver.quit()  # Ensure to quit the driver to free resources
+            soup = BeautifulSoup(html, 'html.parser')
+            return soup.get_text()
+        except Exception as e:
+            return f"Failed to fetch content with error: {e}"
+    from bs4 import BeautifulSoup
+    import requests
+    import undetected_chromedriver as uc
+    @tool("process search results with fallback", return_direct=False)
+    def scrape_with_fallback(url: str) -> str:
+        """
+        Attempts to scrape webpage content using BeautifulSoup first, then falls back to Selenium with undetectable Chrome driver if needed.
+        :param url: The URL of the webpage to scrape.
+        :return: The text content of the webpage.
+        """
+        # Try scraping with requests and BeautifulSoup
+        response = requests.get(url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.content, 'html.parser')
+            if len(soup.get_text().strip()) > 100:  # Arbitrary threshold of 100 characters
+                return soup.get_text()
+        # If the first attempt fails, fallback to Selenium with undetectable Chrome driver
+        try:
+            options = uc.ChromeOptions()
+            options.add_argument('--headless')
+            options.add_argument('--no-sandbox')
+            options.add_argument('--disable-dev-shm-usage')
+            # Initialize undetectable Chrome driver
+            driver = uc.Chrome(options=options)
+            driver.get(url)
+            html = driver.page_source
+            driver.quit()  # Ensure to quit the driver to free resources
+            soup = BeautifulSoup(html, 'html.parser')
+            return soup.get_text()
+        except Exception as e:
+            return f"Failed to fetch content with error: {e}"