| import asyncio |
| import logging |
| import socket |
import ssl
import time
import urllib.parse
import urllib.request
from collections import defaultdict
from datetime import datetime, timedelta
| from typing import ( |
    Any,
    AsyncIterator,
    Dict,
    Iterator,
    List,
    Literal,
    Optional,
    Sequence,
    Union,
| ) |
| import aiohttp |
| import certifi |
| import validators |
| from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader |
| from langchain_community.document_loaders.firecrawl import FireCrawlLoader |
| from langchain_community.document_loaders.base import BaseLoader |
| from langchain_core.documents import Document |
| from open_webui.constants import ERROR_MESSAGES |
| from open_webui.config import ( |
| ENABLE_RAG_LOCAL_WEB_FETCH, |
| PLAYWRIGHT_WS_URI, |
| RAG_WEB_LOADER_ENGINE, |
| FIRECRAWL_API_BASE_URL, |
| FIRECRAWL_API_KEY, |
| ) |
| from open_webui.env import SRC_LOG_LEVELS |
|
|
| log = logging.getLogger(__name__) |
| log.setLevel(SRC_LOG_LEVELS["RAG"]) |
|
|
|
|
def validate_url(url: Union[str, Sequence[str]]) -> bool:
    if isinstance(url, str):
        # validators.url() returns a falsy ValidationError on failure
        if not validators.url(url):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        if not ENABLE_RAG_LOCAL_WEB_FETCH:
            # Local web fetch is disabled; reject URLs that resolve to
            # private addresses (SSRF guard).
            parsed_url = urllib.parse.urlparse(url)
            ipv4_addresses, ipv6_addresses = resolve_hostname(parsed_url.hostname)

            # Reject the URL if any resolved address is private.
            for ip in ipv4_addresses:
| if validators.ipv4(ip, private=True): |
| raise ValueError(ERROR_MESSAGES.INVALID_URL) |
| for ip in ipv6_addresses: |
| if validators.ipv6(ip, private=True): |
| raise ValueError(ERROR_MESSAGES.INVALID_URL) |
| return True |
| elif isinstance(url, Sequence): |
| return all(validate_url(u) for u in url) |
| else: |
| return False |
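# Minimal usage sketch (assumes the hostnames resolve; with
# ENABLE_RAG_LOCAL_WEB_FETCH disabled, a URL resolving to a private address
# raises ValueError rather than returning False):
#
#   validate_url("https://example.com")        # True
#   validate_url(["https://example.com"] * 2)  # True only if every URL passes
#   validate_url(42)                           # False (neither str nor Sequence)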
|
|
|
|
def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
    """Return only the URLs that pass validate_url, silently skipping the rest."""
    valid_urls = []
| for u in url: |
| try: |
| if validate_url(u): |
| valid_urls.append(u) |
| except ValueError: |
| continue |
| return valid_urls |
|
|
|
|
def resolve_hostname(hostname: str) -> tuple[list[str], list[str]]:
    """Resolve a hostname to lists of its IPv4 and IPv6 addresses."""
    addr_info = socket.getaddrinfo(hostname, None)
|
|
| |
| ipv4_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET] |
| ipv6_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET6] |
|
|
| return ipv4_addresses, ipv6_addresses |
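# Illustrative call (a sketch; actual addresses depend on the local resolver):
#
#   ipv4s, ipv6s = resolve_hostname("localhost")
#   # e.g. ipv4s == ["127.0.0.1"] and ipv6s == ["::1"] on a typical host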
|
|
|
|
def extract_metadata(soup, url: str) -> Dict[str, Any]:
    metadata = {"source": url}
| if title := soup.find("title"): |
| metadata["title"] = title.get_text() |
| if description := soup.find("meta", attrs={"name": "description"}): |
| metadata["description"] = description.get("content", "No description found.") |
| if html := soup.find("html"): |
| metadata["language"] = html.get("lang", "No language found.") |
| return metadata |
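# Worked example (a sketch; assumes bs4 is available):
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(
#       '<html lang="en"><head><title>Hi</title>'
#       '<meta name="description" content="Demo page"></head></html>',
#       "html.parser",
#   )
#   extract_metadata(soup, "https://example.com")
#   # -> {'source': 'https://example.com', 'title': 'Hi',
#   #     'description': 'Demo page', 'language': 'en'}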
|
|
|
|
| def verify_ssl_cert(url: str) -> bool: |
| """Verify SSL certificate for the given URL.""" |
| if not url.startswith("https://"): |
| return True |
|
|
    try:
        parsed = urllib.parse.urlparse(url)
        hostname = parsed.hostname
        port = parsed.port or 443
        context = ssl.create_default_context(cafile=certifi.where())
        # wrap_socket() runs the TLS handshake -- and thus the certificate
        # check -- when connect() is called on the wrapped socket
        with context.wrap_socket(socket.socket(), server_hostname=hostname) as s:
            s.connect((hostname, port))
        return True
| except ssl.SSLError: |
| return False |
| except Exception as e: |
| log.warning(f"SSL verification failed for {url}: {str(e)}") |
| return False |
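# Quick checks (a sketch; the second call needs outbound network access):
#
#   verify_ssl_cert("http://example.com")          # True -- plain HTTP is skipped
#   verify_ssl_cert("https://expired.badssl.com")  # False on handshake failure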
|
|
|
|
| class SafeFireCrawlLoader(BaseLoader): |
| def __init__( |
| self, |
| web_paths, |
| verify_ssl: bool = True, |
| trust_env: bool = False, |
| requests_per_second: Optional[float] = None, |
| continue_on_failure: bool = True, |
| api_key: Optional[str] = None, |
| api_url: Optional[str] = None, |
| mode: Literal["crawl", "scrape", "map"] = "crawl", |
| proxy: Optional[Dict[str, str]] = None, |
| params: Optional[Dict] = None, |
| ): |
| """Concurrent document loader for FireCrawl operations. |
| |
| Executes multiple FireCrawlLoader instances concurrently using thread pooling |
| to improve bulk processing efficiency. |
| Args: |
| web_paths: List of URLs/paths to process. |
| verify_ssl: If True, verify SSL certificates. |
| trust_env: If True, use proxy settings from environment variables. |
| requests_per_second: Number of requests per second to limit to. |
| continue_on_failure (bool): If True, continue loading other URLs on failure. |
            api_key: API key for FireCrawl service. Defaults to None
                (uses the FIRECRAWL_API_KEY environment variable if not provided).
| api_url: Base URL for FireCrawl API. Defaults to official API endpoint. |
| mode: Operation mode selection: |
| - 'crawl': Website crawling mode (default) |
| - 'scrape': Direct page scraping |
| - 'map': Site map generation |
| proxy: Proxy override settings for the FireCrawl API. |
| params: The parameters to pass to the Firecrawl API. |
| Examples include crawlerOptions. |
| For more details, visit: https://github.com/mendableai/firecrawl-py |
| """ |
| proxy_server = proxy.get("server") if proxy else None |
| if trust_env and not proxy_server: |
| env_proxies = urllib.request.getproxies() |
| env_proxy_server = env_proxies.get("https") or env_proxies.get("http") |
| if env_proxy_server: |
| if proxy: |
| proxy["server"] = env_proxy_server |
| else: |
| proxy = {"server": env_proxy_server} |
| self.web_paths = web_paths |
| self.verify_ssl = verify_ssl |
| self.requests_per_second = requests_per_second |
| self.last_request_time = None |
| self.trust_env = trust_env |
| self.continue_on_failure = continue_on_failure |
| self.api_key = api_key |
| self.api_url = api_url |
| self.mode = mode |
| self.params = params |
|
|
    def lazy_load(self) -> Iterator[Document]:
        """Load documents sequentially, applying safety checks before each URL."""
| for url in self.web_paths: |
| try: |
| self._safe_process_url_sync(url) |
| loader = FireCrawlLoader( |
| url=url, |
| api_key=self.api_key, |
| api_url=self.api_url, |
| mode=self.mode, |
| params=self.params, |
| ) |
| yield from loader.lazy_load() |
            except Exception as e:
                if self.continue_on_failure:
                    log.exception("Error loading %s", url)
                    continue
                raise e
|
|
    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async version of lazy_load."""
| for url in self.web_paths: |
| try: |
| await self._safe_process_url(url) |
| loader = FireCrawlLoader( |
| url=url, |
| api_key=self.api_key, |
| api_url=self.api_url, |
| mode=self.mode, |
| params=self.params, |
| ) |
| async for document in loader.alazy_load(): |
| yield document |
            except Exception as e:
                if self.continue_on_failure:
                    log.exception("Error loading %s", url)
                    continue
                raise e
|
|
| def _verify_ssl_cert(self, url: str) -> bool: |
| return verify_ssl_cert(url) |
|
|
| async def _wait_for_rate_limit(self): |
| """Wait to respect the rate limit if specified.""" |
| if self.requests_per_second and self.last_request_time: |
| min_interval = timedelta(seconds=1.0 / self.requests_per_second) |
| time_since_last = datetime.now() - self.last_request_time |
| if time_since_last < min_interval: |
| await asyncio.sleep((min_interval - time_since_last).total_seconds()) |
| self.last_request_time = datetime.now() |
|
|
| def _sync_wait_for_rate_limit(self): |
| """Synchronous version of rate limit wait.""" |
| if self.requests_per_second and self.last_request_time: |
| min_interval = timedelta(seconds=1.0 / self.requests_per_second) |
| time_since_last = datetime.now() - self.last_request_time |
| if time_since_last < min_interval: |
| time.sleep((min_interval - time_since_last).total_seconds()) |
| self.last_request_time = datetime.now() |
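    # Rate-limit arithmetic, for reference: with requests_per_second=2 the
    # minimum spacing is timedelta(seconds=0.5); if only 0.2 s have elapsed
    # since the last request, the loader sleeps the remaining 0.3 s.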
|
|
| async def _safe_process_url(self, url: str) -> bool: |
| """Perform safety checks before processing a URL.""" |
| if self.verify_ssl and not self._verify_ssl_cert(url): |
| raise ValueError(f"SSL certificate verification failed for {url}") |
| await self._wait_for_rate_limit() |
| return True |
|
|
| def _safe_process_url_sync(self, url: str) -> bool: |
| """Synchronous version of safety checks.""" |
| if self.verify_ssl and not self._verify_ssl_cert(url): |
| raise ValueError(f"SSL certificate verification failed for {url}") |
| self._sync_wait_for_rate_limit() |
| return True |
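# Hypothetical usage of SafeFireCrawlLoader (a sketch; the api_key and api_url
# values are placeholders, not real credentials or a confirmed endpoint):
#
#   loader = SafeFireCrawlLoader(
#       web_paths=["https://example.com"],
#       api_key="fc-...",                     # placeholder
#       api_url="https://api.firecrawl.dev",  # placeholder base URL
#       mode="scrape",
#       requests_per_second=1,
#   )
#   docs = list(loader.lazy_load())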
|
|
|
|
| class SafePlaywrightURLLoader(PlaywrightURLLoader): |
| """Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection. |
| |
| Attributes: |
| web_paths (List[str]): List of URLs to load. |
| verify_ssl (bool): If True, verify SSL certificates. |
| trust_env (bool): If True, use proxy settings from environment variables. |
| requests_per_second (Optional[float]): Number of requests per second to limit to. |
| continue_on_failure (bool): If True, continue loading other URLs on failure. |
| headless (bool): If True, the browser will run in headless mode. |
| proxy (dict): Proxy override settings for the Playwright session. |
| playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection. |
| """ |
|
|
| def __init__( |
| self, |
| web_paths: List[str], |
| verify_ssl: bool = True, |
| trust_env: bool = False, |
| requests_per_second: Optional[float] = None, |
| continue_on_failure: bool = True, |
| headless: bool = True, |
| remove_selectors: Optional[List[str]] = None, |
| proxy: Optional[Dict[str, str]] = None, |
| playwright_ws_url: Optional[str] = None, |
| ): |
| """Initialize with additional safety parameters and remote browser support.""" |
|
|
| proxy_server = proxy.get("server") if proxy else None |
| if trust_env and not proxy_server: |
| env_proxies = urllib.request.getproxies() |
| env_proxy_server = env_proxies.get("https") or env_proxies.get("http") |
| if env_proxy_server: |
| if proxy: |
| proxy["server"] = env_proxy_server |
| else: |
| proxy = {"server": env_proxy_server} |
|
|
| |
        # headless applies only to a locally launched browser; it is forced off
        # when connecting to a remote browser over WebSocket.
        super().__init__(
            urls=web_paths,
            continue_on_failure=continue_on_failure,
            headless=headless if playwright_ws_url is None else False,
            remove_selectors=remove_selectors,
            proxy=proxy,
        )
| self.verify_ssl = verify_ssl |
| self.requests_per_second = requests_per_second |
| self.last_request_time = None |
| self.playwright_ws_url = playwright_ws_url |
| self.trust_env = trust_env |
|
|
| def lazy_load(self) -> Iterator[Document]: |
| """Safely load URLs synchronously with support for remote browser.""" |
| from playwright.sync_api import sync_playwright |
|
|
| with sync_playwright() as p: |
| |
            # Connect to a remote browser if a WebSocket endpoint is configured,
            # otherwise launch a local browser.
            if self.playwright_ws_url:
                browser = p.chromium.connect(self.playwright_ws_url)
            else:
                browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
|
| for url in self.urls: |
| try: |
| self._safe_process_url_sync(url) |
| page = browser.new_page() |
| response = page.goto(url) |
| if response is None: |
| raise ValueError(f"page.goto() returned None for url {url}") |
|
|
| text = self.evaluator.evaluate(page, browser, response) |
| metadata = {"source": url} |
| yield Document(page_content=text, metadata=metadata) |
                except Exception as e:
                    if self.continue_on_failure:
                        log.exception("Error loading %s", url)
                        continue
                    raise e
| browser.close() |
|
|
| async def alazy_load(self) -> AsyncIterator[Document]: |
| """Safely load URLs asynchronously with support for remote browser.""" |
| from playwright.async_api import async_playwright |
|
|
| async with async_playwright() as p: |
| |
            # Connect to a remote browser if a WebSocket endpoint is configured,
            # otherwise launch a local browser.
            if self.playwright_ws_url:
                browser = await p.chromium.connect(self.playwright_ws_url)
            else:
                browser = await p.chromium.launch(
                    headless=self.headless, proxy=self.proxy
                )
|
|
| for url in self.urls: |
| try: |
| await self._safe_process_url(url) |
| page = await browser.new_page() |
| response = await page.goto(url) |
| if response is None: |
| raise ValueError(f"page.goto() returned None for url {url}") |
|
|
| text = await self.evaluator.evaluate_async(page, browser, response) |
| metadata = {"source": url} |
| yield Document(page_content=text, metadata=metadata) |
                except Exception as e:
                    if self.continue_on_failure:
                        log.exception("Error loading %s", url)
                        continue
                    raise e
| await browser.close() |
|
|
| def _verify_ssl_cert(self, url: str) -> bool: |
| return verify_ssl_cert(url) |
|
|
| async def _wait_for_rate_limit(self): |
| """Wait to respect the rate limit if specified.""" |
| if self.requests_per_second and self.last_request_time: |
| min_interval = timedelta(seconds=1.0 / self.requests_per_second) |
| time_since_last = datetime.now() - self.last_request_time |
| if time_since_last < min_interval: |
| await asyncio.sleep((min_interval - time_since_last).total_seconds()) |
| self.last_request_time = datetime.now() |
|
|
| def _sync_wait_for_rate_limit(self): |
| """Synchronous version of rate limit wait.""" |
| if self.requests_per_second and self.last_request_time: |
| min_interval = timedelta(seconds=1.0 / self.requests_per_second) |
| time_since_last = datetime.now() - self.last_request_time |
| if time_since_last < min_interval: |
| time.sleep((min_interval - time_since_last).total_seconds()) |
| self.last_request_time = datetime.now() |
|
|
| async def _safe_process_url(self, url: str) -> bool: |
| """Perform safety checks before processing a URL.""" |
| if self.verify_ssl and not self._verify_ssl_cert(url): |
| raise ValueError(f"SSL certificate verification failed for {url}") |
| await self._wait_for_rate_limit() |
| return True |
|
|
| def _safe_process_url_sync(self, url: str) -> bool: |
| """Synchronous version of safety checks.""" |
| if self.verify_ssl and not self._verify_ssl_cert(url): |
| raise ValueError(f"SSL certificate verification failed for {url}") |
| self._sync_wait_for_rate_limit() |
| return True |
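# Hypothetical usage of SafePlaywrightURLLoader (a sketch; assumes Playwright and
# a Chromium build are installed, e.g. via `playwright install chromium`):
#
#   loader = SafePlaywrightURLLoader(
#       web_paths=["https://example.com"],
#       requests_per_second=2,
#       playwright_ws_url=None,  # or e.g. "ws://localhost:3000" for a remote browser
#   )
#   for doc in loader.lazy_load():
#       print(doc.metadata["source"], len(doc.page_content))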
|
|
|
|
| class SafeWebBaseLoader(WebBaseLoader): |
| """WebBaseLoader with enhanced error handling for URLs.""" |
|
|
    def __init__(self, *args, trust_env: bool = False, **kwargs):
        """Initialize SafeWebBaseLoader.

        Args:
            trust_env (bool, optional): Set to True to honor proxy settings from
                the http(s)_proxy environment variables when making web requests.
                Defaults to False.
        """
| super().__init__(*args, **kwargs) |
| self.trust_env = trust_env |
|
|
| async def _fetch( |
| self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5 |
| ) -> str: |
| async with aiohttp.ClientSession(trust_env=self.trust_env) as session: |
| for i in range(retries): |
| try: |
| kwargs: Dict = dict( |
| headers=self.session.headers, |
| cookies=self.session.cookies.get_dict(), |
| ) |
| if not self.session.verify: |
| kwargs["ssl"] = False |
|
|
| async with session.get( |
| url, **(self.requests_kwargs | kwargs) |
| ) as response: |
| if self.raise_for_status: |
| response.raise_for_status() |
| return await response.text() |
| except aiohttp.ClientConnectionError as e: |
| if i == retries - 1: |
| raise |
| else: |
| log.warning( |
| f"Error fetching {url} with attempt " |
| f"{i + 1}/{retries}: {e}. Retrying..." |
| ) |
| await asyncio.sleep(cooldown * backoff**i) |
| raise ValueError("retry count exceeded") |
|
|
| def _unpack_fetch_results( |
| self, results: Any, urls: List[str], parser: Union[str, None] = None |
| ) -> List[Any]: |
| """Unpack fetch results into BeautifulSoup objects.""" |
| from bs4 import BeautifulSoup |
|
|
| final_results = [] |
        for i, result in enumerate(results):
            url = urls[i]
            # Pick the parser per URL; reassigning `parser` itself would leak the
            # first URL's choice into every subsequent iteration.
            if parser is None:
                url_parser = "xml" if url.endswith(".xml") else self.default_parser
            else:
                url_parser = parser
            self._check_parser(url_parser)
            final_results.append(BeautifulSoup(result, url_parser, **self.bs_kwargs))
| return final_results |
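    # Parser selection, for reference: a URL ending in ".xml" is parsed with the
    # "xml" parser for that result only; all other URLs use self.default_parser.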
|
|
| async def ascrape_all( |
| self, urls: List[str], parser: Union[str, None] = None |
| ) -> List[Any]: |
| """Async fetch all urls, then return soups for all results.""" |
| results = await self.fetch_all(urls) |
| return self._unpack_fetch_results(results, urls, parser=parser) |
|
|
| def lazy_load(self) -> Iterator[Document]: |
| """Lazy load text from the url(s) in web_path with error handling.""" |
| for path in self.web_paths: |
| try: |
| soup = self._scrape(path, bs_kwargs=self.bs_kwargs) |
| text = soup.get_text(**self.bs_get_text_kwargs) |
|
|
| |
| metadata = extract_metadata(soup, path) |
|
|
| yield Document(page_content=text, metadata=metadata) |
            except Exception:
                # Log and continue with the next URL rather than aborting the batch.
                log.exception("Error loading %s", path)
|
|
| async def alazy_load(self) -> AsyncIterator[Document]: |
| """Async lazy load text from the url(s) in web_path.""" |
| results = await self.ascrape_all(self.web_paths) |
        for path, soup in zip(self.web_paths, results):
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = extract_metadata(soup, path)
            yield Document(page_content=text, metadata=metadata)
|
|
| async def aload(self) -> list[Document]: |
| """Load data into Document objects.""" |
| return [document async for document in self.alazy_load()] |
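# Hypothetical usage of SafeWebBaseLoader (a sketch; it accepts the usual
# WebBaseLoader keyword arguments such as web_paths):
#
#   loader = SafeWebBaseLoader(web_paths=["https://example.com"], trust_env=True)
#   docs = asyncio.run(loader.aload())  # async fetch, with per-URL retries in _fetch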
|
|
|
|
| RAG_WEB_LOADER_ENGINES = defaultdict(lambda: SafeWebBaseLoader) |
| RAG_WEB_LOADER_ENGINES["playwright"] = SafePlaywrightURLLoader |
| RAG_WEB_LOADER_ENGINES["safe_web"] = SafeWebBaseLoader |
| RAG_WEB_LOADER_ENGINES["firecrawl"] = SafeFireCrawlLoader |
|
|
|
|
def get_web_loader(
    urls: Union[str, Sequence[str]],
    verify_ssl: bool = True,
    requests_per_second: int = 2,
    trust_env: bool = False,
) -> BaseLoader:
    # Drop invalid URLs and, when local web fetch is disabled, URLs that
    # resolve to private addresses.
    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
|
|
| web_loader_args = { |
| "web_paths": safe_urls, |
| "verify_ssl": verify_ssl, |
| "requests_per_second": requests_per_second, |
| "continue_on_failure": True, |
| "trust_env": trust_env, |
| } |
|
|
    if RAG_WEB_LOADER_ENGINE.value == "playwright" and PLAYWRIGHT_WS_URI.value:
        web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value
|
|
| if RAG_WEB_LOADER_ENGINE.value == "firecrawl": |
| web_loader_args["api_key"] = FIRECRAWL_API_KEY.value |
| web_loader_args["api_url"] = FIRECRAWL_API_BASE_URL.value |
|
|
| |
| WebLoaderClass = RAG_WEB_LOADER_ENGINES[RAG_WEB_LOADER_ENGINE.value] |
| web_loader = WebLoaderClass(**web_loader_args) |
|
|
| log.debug( |
| "Using RAG_WEB_LOADER_ENGINE %s for %s URLs", |
| web_loader.__class__.__name__, |
| len(safe_urls), |
| ) |
|
|
| return web_loader |
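# Typical call (a sketch; the concrete loader class returned depends on the
# RAG_WEB_LOADER_ENGINE config value):
#
#   loader = get_web_loader(
#       ["https://example.com", "http://127.0.0.1/internal"],
#       verify_ssl=True,
#       requests_per_second=2,
#   )
#   docs = loader.load()  # with ENABLE_RAG_LOCAL_WEB_FETCH disabled, the
#                         # private URL is dropped by safe_validate_urls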
|
|