Spaces:
Build error
Build error
| from bs4 import BeautifulSoup | |
| from selenium import webdriver | |
| from selenium.webdriver.chrome.service import Service | |
| from selenium.webdriver.chrome.options import Options | |
| import time | |
class Website:
    """Snapshot of a web page rendered through Selenium-driven Chrome.

    Constructing an instance loads the page, waits a fixed delay, captures
    the rendered HTML and parses it with BeautifulSoup.

    Attributes:
        url: The URL exactly as passed to the constructor.
        title: Text of the page's ``<title>`` element, or
            ``"No title found"`` when the element is missing or has no
            single text child.
        text: Text content of the page with ``script``, ``style``, ``img``
            and ``input`` elements removed; fragments are stripped and
            joined with newlines.
    """

    def __init__(self, url, *, headless=True, wait_seconds=3):
        """Fetch *url* in Chrome and extract its title and text.

        Args:
            url: Address to load. It is NOT validated here; call
                ``is_valid_url()`` on the instance if validation is needed.
            headless: When true (the default) Chrome runs without a visible
                window. Pass ``False`` to watch the browser, e.g. to get
                past a human-verification page manually.
            wait_seconds: Seconds to sleep after navigation before the page
                source is captured, giving interstitial/verification pages
                time to clear.

        Raises:
            Whatever Selenium raises when the browser cannot be started or
            the page cannot be loaded; the browser is shut down regardless.
        """
        self.url = url
        page_source = self._fetch_page_source(url, headless, wait_seconds)
        self.title, self.text = self._parse_page(page_source)

    @staticmethod
    def _fetch_page_source(url, headless, wait_seconds):
        """Load *url* in a fresh Chrome session and return the rendered HTML."""
        options = Options()
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        if headless:
            options.add_argument("--headless")

        driver = webdriver.Chrome(service=Service(), options=options)
        try:
            driver.get(url)
            # Fixed pause so potential human-verification pages can clear
            # before the DOM is captured.
            time.sleep(wait_seconds)
            return driver.page_source
        finally:
            # Always shut the browser down, even when navigation fails;
            # otherwise the Chrome/chromedriver processes are left running.
            driver.quit()

    @staticmethod
    def _parse_page(page_source):
        """Return ``(title, text)`` extracted from the HTML in *page_source*."""
        soup = BeautifulSoup(page_source, 'html.parser')

        # ``.string`` is None when <title> is empty or contains nested
        # markup, so fall back in that case as well as when the tag is
        # absent. ``str()`` detaches the value from the parse tree.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        title = str(title_text) if title_text else "No title found"

        # Drop elements that contribute no readable text.
        for irrelevant in soup(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return title, text

    def is_valid_url(self):
        """Return True when ``self.url`` is a well-formed http(s) URL.

        Accepted hosts are a dotted domain name, ``localhost`` or a
        dotted-quad of 1-3 digit groups (the numeric range of each group is
        not checked), optionally followed by a port and a path or query.

        Returns:
            bool: False for ``None``, non-string values and anything that
            does not match; True otherwise.
        """
        import re

        if not isinstance(self.url, str):
            return False

        regex = re.compile(
            r'^https?://'  # http:// or https://
            # Domain name; TLD labels may be up to 63 characters long.
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
            r'localhost|'  # localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
            r'(?::\d+)?'  # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        # fullmatch (rather than search) so a trailing newline, which ``$``
        # alone would tolerate, is rejected.
        return regex.fullmatch(self.url) is not None