# hsg_rag_eea/src/scraping/url_normalizer.py
import re
from urllib.parse import urlsplit, urlparse
from ..const.page_blacklist import *
class UrlNormalizer:
    """Utilities for filtering and normalizing URLs discovered while scraping."""

    @staticmethod
    def is_url_blacklisted(url: str) -> bool:
        """Return True if the URL's path contains any blacklisted fragment.

        Matching is case-insensitive and runs against everything after the
        host part. Note: if the URL has no '/' after the host, the check
        falls back to matching against the host itself.
        """
        url_lower = url.lower()
        # Drop the scheme ('://'), then the host (first '/'); the remainder
        # is the path portion the blacklist fragments are matched against.
        path = url_lower.split('://', 1)[-1].split('/', 1)[-1]
        # PAGE_BLACKLIST comes from the wildcard import of
        # ..const.page_blacklist at the top of the file.
        return any(forbidden in path for forbidden in PAGE_BLACKLIST)

    @staticmethod
    def url_to_filename(url: str) -> str:
        """Convert a URL into a filesystem-safe filename.

        Example: 'https://example.com/a/b.html' -> 'example-com_a_b-html'.
        Query strings and fragments are intentionally dropped.
        """
        parsed = urlparse(url)
        # Base name from host + path, without surrounding slashes.
        filename = f"{parsed.netloc}{parsed.path}".strip('/')
        # '/' -> '_' keeps path structure readable; '.' -> '-' avoids the
        # result being interpreted as having a file extension.
        filename = filename.replace('/', '_').replace('.', '-')
        # Anything else that could be unsafe on a filesystem becomes '_'.
        return re.sub(r'[^a-zA-Z0-9_-]', '_', filename)

    def filter_discovered_urls(self, discovered_urls, visited_urls, target_domain) -> list:
        """Return deduplicated URLs worth visiting next.

        Keeps only URLs that are on ``target_domain``, are not blacklisted,
        and have not already been visited. Order of the result is
        unspecified (deduplication goes through a set).

        Bug fix: the original returned a ``set`` despite the ``-> list``
        annotation; this now returns an actual list.
        """
        filtered = set()
        for url in discovered_urls:
            # Guard clauses, short-circuiting as soon as one check fails
            # (the original built a throwaway list for `any`).
            if self.is_url_blacklisted(url):
                continue
            if url in visited_urls:
                continue
            if urlsplit(url).netloc != target_domain:
                continue
            filtered.add(url)
        return list(filtered)