import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from src.utils.exceptions import CustomException
from src.utils.functions import cleanText, getConfig
from src.utils.logging import logger

class WebsiteCrawler:
    def __init__(self):
        """Initialize the WebsiteCrawler with configuration settings."""
        self.config = getConfig(path="config.ini")
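        # The config loaded above is expected to provide at least the
        # following (an assumed layout; only the [WEBCRAWLER] "timeout"
        # key, in seconds, is actually read by this class):
        #
        #   [WEBCRAWLER]
        #   timeout = 30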

    def getLinksFromPage(self, url: str) -> list[str]:
        """
        Extract all valid links from a given webpage.

        Args:
            url (str): The URL of the webpage to extract links from.

        Returns:
            list[str]: A list of extracted links from the page.
        """
        # Bound the request so one dead host cannot hang the whole crawl.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")
        links = []
        for anchor in soup.find_all("a"):
            href = anchor.attrs.get("href")
            if not href:
                continue
            if urlparse(href).netloc == urlparse(url).netloc:
                # Absolute link on the same host: keep it as-is.
                links.append(href)
            elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                # Relative link: resolve it against the page URL. The appended
                # "/" makes urljoin treat the last path segment as a directory.
                links.append(urljoin(url + "/", href))
        # Drop fragment links and deduplicate.
        links = [link for link in links if "#" not in link]
        return list(set(links))
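
    # Illustrative behavior for a page at https://example.com/docs (a
    # hypothetical URL, not from the original code):
    #   href="about.html"               -> "https://example.com/docs/about.html"
    #   href="https://example.com/blog" -> kept as-is (same netloc)
    #   href="mailto:hi@example.com"    -> skipped by the prefix check
    #   href="#section"                 -> dropped by the "#" filter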

    def getLinks(self, url: str) -> list[str]:
        """
        Fetch and return all unique links found from the given URL.

        Args:
            url (str): The starting URL to fetch links from.

        Returns:
            list[str]: A list of unique links found.
        """
        try:
            logger.info("Fetching links from URL")
            start = time.time()
            links = self.getLinksFromPage(url)
            # Seed with the first-level links, then crawl one level deeper.
            uniqueLinks = set(links)
            for link in links:
                # Stop crawling once the configured time budget is spent.
                if time.time() - start > self.config.getint("WEBCRAWLER", "timeout"):
                    break
                uniqueLinks.update(self.getLinksFromPage(link))
            # Strip trailing slashes so duplicates collapse to one entry.
            return list({link.rstrip("/") for link in uniqueLinks})
        except Exception as e:
            logger.error(CustomException(e))
            return []

    def extractTextFromUrl(self, url: str) -> str:
        """
        Extract and clean text content from a given URL.

        Args:
            url (str): The URL of the webpage to extract text from.

        Returns:
            str: Cleaned text extracted from the webpage.
        """
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return cleanText(text=soup.get_text(separator=" ", strip=True))

    def extractTextFromUrlList(self, urls: list[str]) -> str:
        """
        Extract text from a list of URLs concurrently.

        Args:
            urls (list[str]): A list of URLs to extract text from.

        Returns:
            str: All extracted text combined into a single string.
        """
        try:
            logger.info("Extracting text from URLs")
            # Fetch pages in parallel; executor.map preserves input order.
            with ThreadPoolExecutor() as executor:
                texts = list(executor.map(self.extractTextFromUrl, urls))
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
            return ""
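
# A minimal usage sketch, not part of the original module. Assumptions: a
# config.ini with a [WEBCRAWLER] section defining an integer "timeout" exists
# next to the entry point, and the placeholder URL below is reachable.
if __name__ == "__main__":
    crawler = WebsiteCrawler()
    links = crawler.getLinks("https://example.com")
    print(f"Found {len(links)} links")
    text = crawler.extractTextFromUrlList(links)
    print(text[:500])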