Spaces:

MarkMCase
/

brochure-gen

Build error

brochure-gen / models /website_client.py

Mark

dev updates

abafd11 over 1 year ago

1.68 kB

	from bs4 import BeautifulSoup
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.chrome.options import Options
	import time
	class Website:
	    """A web page fetched with headless Chrome and reduced to title and text.

	    Attributes:
	        url: The URL passed to the constructor.
	        title: Contents of the page's ``<title>`` tag, or
	            ``"No title found"`` when the tag is missing or has no single
	            string value.
	        text: Text of the page with ``script``, ``style``, ``img`` and
	            ``input`` elements removed, one text fragment per line.
	    """

	    def __init__(self, url, wait_seconds=3):
	        """Load *url* in headless Chrome and extract its title and text.

	        Args:
	            url: Address of the page to load.
	            wait_seconds: Seconds to pause after navigation before the page
	                source is read. Defaults to 3, the previously hard-coded
	                value.
	        """
	        self.url = url

	        options = Options()
	        options.add_argument("--no-sandbox")
	        options.add_argument("--disable-dev-shm-usage")
	        # Remove this argument to run with a visible browser window.
	        options.add_argument("--headless")

	        service = Service()
	        driver = webdriver.Chrome(service=service, options=options)
	        try:
	            driver.get(url)
	            # Allow time for potential human verification pages to clear.
	            time.sleep(wait_seconds)
	            page_source = driver.page_source
	        finally:
	            # Always shut the browser down, even when navigation fails, so
	            # no Chrome process is left behind.
	            driver.quit()

	        soup = BeautifulSoup(page_source, 'html.parser')

	        # ``.string`` can be None even when a <title> tag exists (for
	        # example when the tag is empty), so fall back in that case too.
	        title_text = soup.title.string if soup.title else None
	        self.title = title_text if title_text else "No title found"

	        # Drop elements that contribute no readable text.
	        for irrelevant in soup(["script", "style", "img", "input"]):
	            irrelevant.decompose()
	        self.text = soup.get_text(separator="\n", strip=True)

	    def is_valid_url(self):
	        """Return True when ``self.url`` looks like an http(s) URL.

	        The host may be a domain name, ``localhost`` or a dotted IPv4-style
	        address, optionally followed by a port and a path or query string.

	        Returns:
	            bool: True if the URL matches the pattern, False otherwise
	            (including when ``self.url`` is None).
	        """
	        import re

	        # Alternation must be a bare ``|``; an escaped ``\|`` would only
	        # match a literal pipe character and reject every real URL.
	        regex = re.compile(
	            r'^https?://'  # http:// or https://
	            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
	            r'localhost|'  # localhost...
	            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
	            r'(?::\d+)?'  # optional port
	            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
	        return self.url is not None and regex.search(self.url) is not None