|
|
import re |
|
|
|
|
|
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader |
|
|
|
|
|
from langflow.custom import Component |
|
|
from langflow.helpers.data import data_to_text |
|
|
from langflow.io import DropdownInput, MessageTextInput, Output |
|
|
from langflow.schema import Data |
|
|
from langflow.schema.message import Message |
|
|
|
|
|
|
|
|
class URLComponent(Component):
    """Component that loads the content of one or more URLs.

    The content is exposed either as a list of ``Data`` records (one per loaded
    document) or as a single ``Message`` built from those records.
    """

    display_name = "URL"
    description = "Fetch content from one or more URLs."
    icon = "layout-template"
    name = "URL"

    # Compiled once at class creation instead of on every ``ensure_url`` call.
    # The trailing group accepts a path, query or fragment so that URLs with an
    # empty path (``http://example.com?q=1``) are not rejected.
    _URL_PATTERN = re.compile(
        r"^(https?:\/\/)?"  # protocol (optional in the pattern, always present after normalisation)
        r"(www\.)?"  # optional "www." prefix
        r"([a-zA-Z0-9.-]+)"  # host name
        r"(\.[a-zA-Z]{2,})?"  # optional top-level domain
        r"(:\d+)?"  # optional port
        r"([\/?#][^\s]*)?$",  # optional path / query / fragment
        re.IGNORECASE,
    )

    inputs = [
        MessageTextInput(
            name="urls",
            display_name="URLs",
            info="Enter one or more URLs, by clicking the '+' button.",
            is_list=True,
            tool_mode=True,
        ),
        DropdownInput(
            name="format",
            display_name="Output Format",
            info="Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
            options=["Text", "Raw HTML"],
            value="Text",
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="fetch_content"),
        Output(display_name="Text", name="text", method="fetch_content_text"),
    ]

    def ensure_url(self, string: str) -> str:
        """Return ``string`` as an ``http(s)`` URL, adding ``http://`` when no scheme is given.

        The scheme check is case-insensitive; an existing ``http``/``https``
        scheme is lower-cased and otherwise left in place.

        Parameters:
            string (str): The string to be checked and possibly modified.

        Returns:
            str: The string with an ``http://`` or ``https://`` scheme.

        Raises:
            ValueError: If the resulting string does not look like a valid URL.
        """
        if string.lower().startswith(("http://", "https://")):
            # Normalise the scheme only ("HTTPS://x" -> "https://x"); the rest
            # of the URL is left untouched.
            scheme, _, remainder = string.partition("://")
            string = f"{scheme.lower()}://{remainder}"
        else:
            string = "http://" + string

        if not self._URL_PATTERN.match(string):
            msg = f"Invalid URL: {string}"
            raise ValueError(msg)

        return string

    def fetch_content(self) -> list[Data]:
        """Load every configured URL and return one ``Data`` per loaded document.

        Blank entries are skipped and each remaining entry is validated with
        ``ensure_url``. ``AsyncHtmlLoader`` is used when the format is
        "Raw HTML"; ``WebBaseLoader`` is used otherwise.

        Returns:
            list[Data]: Records built from each document's ``page_content``
            (as ``text``) and its metadata. Also stored in ``self.status``.

        Raises:
            ValueError: If any non-blank entry is not a valid URL.
        """
        raw_urls = self.urls or []
        if isinstance(raw_urls, str):
            # Defensive: a bare string would otherwise be iterated character
            # by character. NOTE(review): confirm whether callers can pass one.
            raw_urls = [raw_urls]

        stripped = (entry.strip() for entry in raw_urls)
        urls = [self.ensure_url(entry) for entry in stripped if entry]

        if self.format == "Raw HTML":
            loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
        else:
            loader = WebBaseLoader(web_paths=urls, encoding="utf-8")

        docs = loader.load()
        data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]
        self.status = data
        return data

    def fetch_content_text(self) -> Message:
        """Load the URLs and return their content as a single ``Message``.

        The records from ``fetch_content`` are passed to ``data_to_text`` with
        the template ``"{text}"``; the resulting string is stored in
        ``self.status`` and wrapped in a ``Message``.
        """
        data = self.fetch_content()

        result_string = data_to_text("{text}", data)
        self.status = result_string
        return Message(text=result_string)
|
|
|