ConversAI / src /components /loaders /websiteCrawler.py
techconsptrs's picture
INITIAL COMMIT
7e24b41
raw
history blame
2.59 kB
from concurrent.futures import ThreadPoolExecutor
from src.utils.exceptions import CustomException
from urllib.parse import urlparse, urljoin
from src.utils.functions import getConfig
from src.utils.functions import cleanText
from src.utils.logging import logger
from bs4 import BeautifulSoup
import time
import requests
class WebsiteCrawler:
    """Collect same-site links from a website and extract text from pages.

    The crawl is two levels deep: the start page, plus every page the start
    page links to. Settings are read from ``config.ini`` through the project
    helper ``getConfig``; this class reads the integer option ``timeout``
    from the ``WEBCRAWLER`` section and compares it against elapsed seconds.
    """

    # Upper bound, in seconds, for a single HTTP request. requests waits
    # forever by default, which would let one stalled server hang the crawl
    # regardless of the configured crawl budget.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        self.config = getConfig(path="config.ini")

    @staticmethod
    def _resolveLink(url: str, href: str):
        """Turn one ``href`` found on ``url`` into an absolute same-site URL.

        Args:
            url: Address of the page the anchor was found on.
            href: Raw value of the anchor's ``href`` attribute.

        Returns:
            The absolute URL as a string, or ``None`` when the link points to
            another host or uses a scheme without a host (``mailto:``,
            ``tel:``, ``javascript:``, ``file:``, ...).
        """
        target = urlparse(href)
        if target.netloc:
            if target.netloc != urlparse(url).netloc:
                return None
            # urljoin leaves absolute URLs untouched and gives
            # protocol-relative ones ("//host/path") the page's scheme, so
            # the result can always be fetched later.
            return urljoin(url, href)
        if target.scheme:
            # Decide on the parsed scheme rather than a string prefix, so
            # relative paths such as "files/a.html" or "telephone.html" are
            # not mistaken for "file:" / "tel:" links.
            return None
        # Relative link. The page URL is treated as a directory (hence the
        # added "/"), which is how relative links have always been resolved
        # by this crawler.
        return urljoin(url + "/", href)

    @staticmethod
    def _stripTrailingSlash(link: str) -> str:
        """Remove a single trailing ``/`` so equivalent URLs compare equal."""
        return link[:-1] if link.endswith("/") else link

    def getLinksFromPage(self, url: str):
        """Return the unique same-site links found in the anchors of ``url``.

        Links containing ``#`` are discarded. The order of the returned list
        is unspecified.

        Args:
            url: Page to download and scan.

        Returns:
            A list of absolute URLs (strings) without duplicates.

        Raises:
            requests.RequestException: If the page cannot be downloaded
                within ``REQUEST_TIMEOUT_SECONDS``.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(response.content, "html.parser")
        links = set()
        for anchor in soup.find_all("a"):
            href = anchor.attrs.get("href")
            if href is None:
                continue
            link = self._resolveLink(url, href)
            if link is not None and "#" not in link:
                links.add(link)
        return list(links)

    def getLinks(self, url: str):
        """Crawl ``url`` and the pages it links to, collecting site links.

        The result contains the links found on the start page together with
        the links found on each of those pages. Visiting stops once the
        configured ``WEBCRAWLER``/``timeout`` budget (seconds) is exceeded.
        A linked page that fails to download is logged and skipped.

        Args:
            url: Start page of the crawl.

        Returns:
            A list of unique URLs with one trailing ``/`` removed, or
            ``None`` if the crawl itself failed (the error is logged).
        """
        try:
            logger.info("fetching links from url")
            # Monotonic clock: elapsed time is immune to system clock changes.
            start = time.monotonic()
            firstLevel = self.getLinksFromPage(url)
            budget = self.config.getint("WEBCRAWLER", "timeout")
            # Seed with the start page's own links so they are reported even
            # when no linked page happens to repeat them.
            uniqueLinks = set(firstLevel)
            for link in firstLevel:
                if time.monotonic() - start > budget:
                    break
                try:
                    uniqueLinks.update(self.getLinksFromPage(link))
                except requests.RequestException as e:
                    # One unreachable page must not discard the whole crawl.
                    logger.error(CustomException(e))
            return list({self._stripTrailingSlash(link) for link in uniqueLinks})
        except Exception as e:
            logger.error(CustomException(e))
            return None

    def extractTextFromUrl(self, url: str):
        """Download ``url`` and return its visible text.

        The text of the parsed HTML is joined with single spaces and passed
        through the project helper ``cleanText``.

        Args:
            url: Page to download.

        Returns:
            Whatever ``cleanText`` returns for the page text.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                4xx/5xx response status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return cleanText(text=soup.get_text(separator=" ", strip=True))

    def extractTextFromUrlList(self, urls: list[str]):
        """Extract text from several URLs concurrently.

        Args:
            urls: Pages to download.

        Returns:
            The page texts joined by newlines, in the order of ``urls``, or
            ``None`` if any page failed (the error is logged).
        """
        try:
            logger.info("extracting text from urls")
            with ThreadPoolExecutor() as executor:
                texts = list(executor.map(self.extractTextFromUrl, urls))
            return "\n".join(texts)
        except Exception as e:
            logger.error(CustomException(e))
            return None