from urllib.parse import urlparse

from app.core.exceptions import InvalidDocumentURLError


def validate_document_url(url: str) -> str:
    """Validate that the given string is a well-formed http(s) URL.

    Leading and trailing whitespace is stripped before validation, and the
    stripped value is what gets returned.

    Args:
        url: The URL to validate.

    Returns:
        The trimmed URL.

    Raises:
        InvalidDocumentURLError: If the value is not a string, is empty or
            whitespace-only, cannot be parsed, uses a scheme other than
            http/https, has no host component, or carries an invalid port.
    """
    # Check the type first so non-string input never reaches str methods.
    if not isinstance(url, str):
        raise InvalidDocumentURLError("Document URL must be a non-empty string.")

    # Strip before the emptiness check so whitespace-only input is reported
    # as empty rather than as an unsupported (blank) scheme.
    url = url.strip()
    if not url:
        raise InvalidDocumentURLError("Document URL must be a non-empty string.")

    # urlparse() raises ValueError on some malformed inputs (for example an
    # unbalanced IPv6 bracket); surface that as the documented error type.
    try:
        parsed = urlparse(url)
    except ValueError as err:
        raise InvalidDocumentURLError("Document URL is malformed.") from err

    if parsed.scheme not in {"http", "https"}:
        raise InvalidDocumentURLError(
            f"Unsupported URL scheme: '{parsed.scheme}'. Use http or https."
        )

    # netloc alone is not enough: "http://:8080" or "http://user@" have a
    # non-empty netloc but no actual host. Reading .port also validates that
    # any port is numeric and in range (it raises ValueError otherwise).
    try:
        hostname = parsed.hostname
        _ = parsed.port
    except ValueError as err:
        raise InvalidDocumentURLError("Document URL is malformed.") from err

    if not hostname:
        raise InvalidDocumentURLError("Document URL is missing a valid host.")

    return url