import time
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup


class WebScraper:
    def __init__(self, request_timeout: float = 5.0):
        # Reuse one session across requests for connection pooling.
        self.session = requests.Session()
        self.request_timeout = request_timeout

    def get_links(self, url: str, timeout: float = 4.0) -> list:
        """Fetch `url`, visit each same-domain link found there, and return
        the links collected from those pages.

        `timeout` is a total crawl budget in seconds, not a per-request timeout.
        """
        start = time.time()

        def get_links_from_page(page_url: str) -> list:
            # Fetch the page; treat network and HTTP errors as "no links".
            try:
                response = self.session.get(page_url, timeout=self.request_timeout)
                response.raise_for_status()
            except requests.RequestException:
                return []
            soup = BeautifulSoup(response.content, "lxml")
            links = set()
            for anchor in soup.find_all("a", href=True):
                href = anchor["href"]
                if "#" in href:
                    continue  # skip fragment links
                if urlparse(href).netloc == urlparse(page_url).netloc:
                    # Absolute URL on the same domain: keep it as-is.
                    links.add(href)
                elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                    # Relative URL: resolve it against the current page.
                    links.add(urljoin(page_url + "/", href))
            return list(links)
        links = get_links_from_page(url)
        unique_links = set()
        for link in links:
            # Stop following links once the crawl budget is exhausted.
            if time.time() - start > timeout:
                break
            unique_links.update(get_links_from_page(link))
        # Normalize by stripping trailing slashes, then deduplicate.
        return list({link.rstrip("/") for link in unique_links})
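

# A minimal usage sketch, assuming a reachable site: "https://example.com" is a
# placeholder URL, and the lxml parser used above must be installed
# (pip install lxml).
if __name__ == "__main__":
    scraper = WebScraper()
    for link in scraper.get_links("https://example.com", timeout=4):
        print(link)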