Spaces:
Running
Running
| import re | |
| from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader | |
| from langflow.custom import Component | |
| from langflow.helpers.data import data_to_text | |
| from langflow.io import DropdownInput, MessageTextInput, Output | |
| from langflow.schema import Data | |
| from langflow.schema.message import Message | |
class URLComponent(Component):
    """Langflow component that fetches the content of one or more URLs.

    The component exposes two outputs built from the same fetch:

    * ``data`` -- a list of ``Data`` objects, one per loaded document.
    * ``text`` -- a single ``Message`` whose text is produced by formatting
      those ``Data`` objects with the ``"{text}"`` template.

    The ``format`` input selects the loader: ``"Raw HTML"`` uses
    ``AsyncHtmlLoader``; any other value uses ``WebBaseLoader``.
    """

    display_name = "URL"
    description = "Fetch content from one or more URLs."
    icon = "layout-template"
    name = "URL"

    # Basic URL validation regex, compiled once at class creation instead of
    # on every ``ensure_url`` call (which runs once per URL).
    _URL_REGEX = re.compile(
        r"^(https?:\/\/)?"  # optional protocol
        r"(www\.)?"  # optional www
        r"([a-zA-Z0-9.-]+)"  # domain
        r"(\.[a-zA-Z]{2,})?"  # top-level domain
        r"(:\d+)?"  # optional port
        r"(\/[^\s]*)?$",  # optional path
        re.IGNORECASE,
    )

    inputs = [
        MessageTextInput(
            name="urls",
            display_name="URLs",
            info="Enter one or more URLs, by clicking the '+' button.",
            is_list=True,
            tool_mode=True,
        ),
        DropdownInput(
            name="format",
            display_name="Output Format",
            info="Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
            options=["Text", "Raw HTML"],
            value="Text",
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="fetch_content"),
        Output(display_name="Text", name="text", method="fetch_content_text"),
    ]

    def ensure_url(self, string: str) -> str:
        """Return ``string`` as an http(s) URL, prepending ``http://`` if needed.

        The scheme check is case-insensitive (URI schemes are not case
        sensitive), so an input such as ``HTTP://example.com`` is kept as-is
        rather than being prefixed a second time and then rejected.

        The validation is intentionally basic: it only checks the overall
        shape (optional scheme, host, optional port, optional path) and does
        not verify that the host exists.

        Parameters:
            string (str): The string to be checked and possibly modified.

        Returns:
            str: The input, prefixed with ``http://`` when it had no
            ``http://`` or ``https://`` scheme.

        Raises:
            ValueError: If the resulting string does not match the basic
            URL pattern.
        """
        # Compare on a lowered copy so "HTTP://..." / "Https://..." count as
        # already having a scheme; the original casing is preserved.
        if not string.lower().startswith(("http://", "https://")):
            string = "http://" + string

        if not self._URL_REGEX.match(string):
            msg = f"Invalid URL: {string}"
            raise ValueError(msg)
        return string

    def _normalized_urls(self) -> list[str]:
        """Return the validated, whitespace-stripped URLs from ``self.urls``.

        Accepts a list of strings, a single string (wrapped into a one-item
        list so it is not iterated character by character), or ``None``
        (treated as no URLs). Blank entries are dropped.

        Raises:
            ValueError: Propagated from ``ensure_url`` for an invalid entry.
        """
        raw_urls = self.urls
        if raw_urls is None:
            return []
        if isinstance(raw_urls, str):
            raw_urls = [raw_urls]
        return [self.ensure_url(url.strip()) for url in raw_urls if url.strip()]

    def fetch_content(self) -> list[Data]:
        """Load every configured URL and return one ``Data`` per document.

        Each ``Data`` carries the document's page content as ``text`` plus the
        loader-provided metadata as additional fields. The result is also
        stored in ``self.status``.

        Returns:
            list[Data]: The loaded documents; empty when no non-blank URL
            was provided.

        Raises:
            ValueError: If any provided URL fails validation.
        """
        urls = self._normalized_urls()
        if not urls:
            # Nothing to fetch: skip building a loader entirely.
            self.status = []
            return []

        if self.format == "Raw HTML":
            loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
        else:
            loader = WebBaseLoader(web_paths=urls, encoding="utf-8")

        docs = loader.load()
        data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]
        self.status = data
        return data

    def fetch_content_text(self) -> Message:
        """Fetch the URLs and return their text combined into one ``Message``.

        The documents from ``fetch_content`` are formatted with the
        ``"{text}"`` template via ``data_to_text``; the resulting string is
        stored in ``self.status`` and wrapped in a ``Message``.

        Returns:
            Message: Message whose text is the formatted content.

        Raises:
            ValueError: If any provided URL fails validation.
        """
        data = self.fetch_content()
        result_string = data_to_text("{text}", data)
        self.status = result_string
        return Message(text=result_string)