Spaces:
Sleeping
Sleeping
| """Simple Web scraper.""" | |
| from typing import List, Optional, Dict, Callable | |
| import requests | |
| from llama_index.core.bridge.pydantic import PrivateAttr | |
| from llama_index.core.readers.base import BasePydanticReader | |
| from llama_index.core.schema import Document | |
class SimpleWebPageReader(BasePydanticReader):
    """Simple web page reader.

    Fetches each URL with an HTTP GET and wraps the response body in a
    ``Document`` whose id is the URL.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.
        metadata_fn (Optional[Callable[[str], Dict]]): A function that takes in
            a URL and returns a dictionary of metadata.
            Default is None.
    """

    is_remote: bool = True
    html_to_text: bool

    _metadata_fn: Optional[Callable[[str], Dict]] = PrivateAttr()

    def __init__(
        self,
        html_to_text: bool = False,
        metadata_fn: Optional[Callable[[str], Dict]] = None,
    ) -> None:
        """Initialize with parameters.

        Args:
            html_to_text (bool): Convert fetched HTML to plain text.
            metadata_fn (Optional[Callable[[str], Dict]]): Maps a URL to the
                metadata dict attached to its document.

        Raises:
            ImportError: If ``html_to_text`` is requested but the
                `html2text` package is not installed.
        """
        # Only the HTML-to-text path needs html2text, so fail fast here only
        # when that path was actually requested.
        if html_to_text:
            try:
                import html2text  # noqa: F401
            except ImportError as err:
                raise ImportError(
                    "`html2text` package not found, please run `pip install html2text`"
                ) from err

        # The base model must be initialised before private attributes are
        # assigned: Pydantic v2 creates the private-attribute storage inside
        # BaseModel.__init__, so assigning earlier fails there.
        super().__init__(html_to_text=html_to_text)
        self._metadata_fn = metadata_fn

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "SimpleWebPageReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the given URLs.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: One document per URL, in input order.

        Raises:
            ValueError: If ``urls`` is not a list.
        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        convert = None
        if self.html_to_text:
            # Imported lazily (and once, not per URL) because the dependency
            # is optional.
            import html2text

            convert = html2text.html2text

        documents = []
        for url in urls:
            # NOTE(review): no timeout is set and the HTTP status is not
            # checked, so a stalled server blocks indefinitely and error
            # pages are stored as document text. Left unchanged to preserve
            # existing behaviour.
            text = requests.get(url, headers=None).text
            if convert is not None:
                text = convert(text)

            metadata: Optional[Dict] = None
            if self._metadata_fn is not None:
                metadata = self._metadata_fn(url)

            # A missing or empty metadata result becomes an empty dict.
            documents.append(Document(text=text, id_=url, metadata=metadata or {}))

        return documents