# Spaces: Running
# File size: 1,303 Bytes
# 698965e
import re
from urllib.parse import urlsplit, urlparse
from ..const.page_blacklist import *
class UrlNormalizer:
    """Helpers for checking, naming and filtering URLs."""

    @staticmethod
    def is_url_blacklisted(url: str) -> bool:
        """Return True if the part of *url* after the host contains a blacklisted term.

        The URL is lower-cased, the scheme (everything up to ``://``) is
        dropped, and whatever follows the first ``/`` is searched for each
        entry of ``PAGE_BLACKLIST``. That remainder still includes any query
        string or fragment. A URL with nothing after the host has an empty
        path and is never blacklisted.

        Args:
            url: The URL to check.

        Returns:
            True when any ``PAGE_BLACKLIST`` entry is a substring of the path.
        """
        without_scheme = url.lower().split('://', 1)[-1]
        # partition() yields an empty path when there is no '/', so the host
        # name itself is never matched against the blacklist.
        _host, _slash, path = without_scheme.partition('/')
        # NOTE(review): entries are compared against a lower-cased path, so
        # this assumes PAGE_BLACKLIST holds lower-case strings -- confirm.
        return any(forbidden in path for forbidden in PAGE_BLACKLIST)

    @staticmethod
    def url_to_filename(url: str) -> str:
        """Build a filename from the host and path of *url*.

        Query string and fragment are ignored. Slashes become ``_``, dots
        become ``-`` and every remaining character outside
        ``[a-zA-Z0-9_-]`` becomes ``_``.

        Args:
            url: The URL to convert.

        Returns:
            The sanitised name; empty if *url* has neither host nor path.
        """
        parsed = urlparse(url)
        # Host + path, without leading/trailing slashes.
        filename = f"{parsed.netloc}{parsed.path}".strip('/')
        # Replace separators before the catch-all so dots map to '-' rather
        # than '_'.
        filename = filename.replace('/', '_').replace('.', '-')
        # Replace every remaining problematic character.
        return re.sub(r'[^a-zA-Z0-9_-]', '_', filename)

    def filter_discovered_urls(self, discovered_urls, visited_urls, target_domain) -> set:
        """Return the discovered URLs that are still eligible.

        A URL is kept only if it has not been visited, its network location
        equals *target_domain* exactly, and it is not blacklisted.

        Args:
            discovered_urls: Iterable of URL strings to filter.
            visited_urls: Container of URLs to exclude; checked with ``in``.
            target_domain: Value the URL's ``netloc`` must equal.

        Returns:
            A set of the URLs that passed all three checks.
        """
        # Checks short-circuit, cheapest first.
        return {
            url
            for url in discovered_urls
            if url not in visited_urls
            and urlsplit(url).netloc == target_domain
            and not self.is_url_blacklisted(url)
        }
|