# brochure-gen/models/website_client.py
# Author: Mark
# Commit message: dev updates
# Commit: abafd11
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
class Website:
    """Snapshot of a web page rendered through Selenium-driven Chrome.

    Constructing an instance launches Chrome, loads ``url``, waits a fixed
    delay, captures the rendered HTML and closes the browser again.

    Attributes:
        url: The address passed to the constructor, stored unchanged.
        title: Text of the page's ``<title>`` element, or
            ``"No title found"`` when the element is missing or has no
            single text child.
        text: Text content of the page, fragments separated by newlines,
            with ``script``, ``style``, ``img`` and ``input`` elements
            removed beforehand.
    """

    # Anchored pattern for absolute http(s) URLs. Compiled once at class
    # creation instead of on every is_valid_url() call.
    _URL_PATTERN = re.compile(
        r'^https?://'  # http:// or https://
        # Domain name. The TLD may be up to 63 letters (the DNS label
        # limit) so that long TLDs such as ".photography" are accepted.
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IPv4 address
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',  # optional path / query string
        re.IGNORECASE,
    )

    def __init__(self, url, *, headless=True, wait_seconds=3):
        """Load ``url`` in Chrome and extract its title and text.

        Args:
            url: Address handed to the browser. It is not validated here;
                call ``is_valid_url()`` to check its syntax.
            headless: When true (the default) Chrome runs without a
                visible window. Pass ``False`` to watch the browser, e.g.
                while diagnosing a human-verification page.
            wait_seconds: Seconds to sleep after navigation so that
                potential human-verification pages have time to clear
                before the HTML is captured.
        """
        self.url = url
        page_source = self._fetch_page_source(url, headless, wait_seconds)

        soup = BeautifulSoup(page_source, 'html.parser')

        # ``soup.title.string`` is None when <title> is empty or contains
        # nested markup, so the fallback must cover that case as well as a
        # missing <title> element.
        title_tag = soup.title
        if title_tag is not None and title_tag.string is not None:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        # Drop elements that contribute no readable text.
        for irrelevant in soup(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = soup.get_text(separator="\n", strip=True)

    @staticmethod
    def _fetch_page_source(url, headless, wait_seconds):
        """Return the HTML of ``url`` as rendered by a fresh Chrome session."""
        options = Options()
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        if headless:
            options.add_argument("--headless")

        driver = webdriver.Chrome(service=Service(), options=options)
        try:
            driver.get(url)
            # Allow time for potential human verification pages to clear.
            time.sleep(wait_seconds)
            return driver.page_source
        finally:
            # Always shut the browser down, even when navigation fails, so
            # no Chrome/chromedriver process is left running.
            driver.quit()

    def is_valid_url(self):
        """Return True when ``self.url`` is a syntactically valid URL.

        Accepts ``http://`` or ``https://`` followed by a domain name,
        ``localhost`` or a dotted-quad IPv4 address, with an optional port
        and an optional path or query string. Only the string's shape is
        checked; no network access is performed.

        Returns:
            bool: False for ``None``, non-string values and strings that do
            not match the pattern; True otherwise.
        """
        if not isinstance(self.url, str):
            return False
        return self._URL_PATTERN.search(self.url) is not None