import unicodedata
from pathlib import Path
from typing import Any, Callable, Dict, List, Literal, Optional, cast

from llama_index.core.node_parser.interface import TextSplitter
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

path = Path(__file__).parent / "Readability.js"


def nfkc_normalize(text: str) -> str:
    """Normalize text to NFKC form, folding compatibility characters."""
    return unicodedata.normalize("NFKC", text)
class ReadabilityWebPageReader(BaseReader):
    """Readability Webpage Loader.

    Extracts the main, relevant content from a fully rendered web page.
    Web pages used as data sources are assumed to contain textual content.

    1. Load the page and wait until it is rendered (Playwright).
    2. Inject Readability.js to extract the main content.

    Args:
        proxy (Optional[str], optional): Proxy server. Defaults to None.
        wait_until (Optional[Literal["commit", "domcontentloaded", "load", "networkidle"]], optional):
            Playwright load event to wait for before extracting. Defaults to "domcontentloaded".
        text_splitter (Optional[TextSplitter], optional): Text splitter. Defaults to None.
        normalize (Optional[Callable[[str], str]], optional): Text normalizer. Defaults to nfkc_normalize.
    """
    def __init__(
        self,
        proxy: Optional[str] = None,
        wait_until: Optional[
            Literal["commit", "domcontentloaded", "load", "networkidle"]
        ] = "domcontentloaded",
        text_splitter: Optional[TextSplitter] = None,
        normalize: Optional[Callable[[str], str]] = nfkc_normalize,
    ) -> None:
        self._launch_options = {
            "headless": True,
        }
        self._wait_until = wait_until
        if proxy:
            self._launch_options["proxy"] = {
                "server": proxy,
            }
        self._text_splitter = text_splitter
        self._normalize = normalize
        # Readability.js source is read lazily on the first scrape.
        self._readability_js: Optional[str] = None
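
    # Proxy example (a sketch; the server address and credentials are
    # illustrative). The string is forwarded to Playwright's launch options
    # as {"proxy": {"server": ...}}:
    #
    #     ReadabilityWebPageReader(proxy="http://user:pass@proxy.example.com:8080")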
    def load_data(self, url: str) -> List[Document]:
        """Render and load data content from url.

        Args:
            url (str): URL to scrape.

        Returns:
            List[Document]: List of documents.
        """
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(**self._launch_options)
            article = self.scrape_page(browser, url)

            # Keep Readability's article metadata alongside each chunk.
            extra_info = {
                key: article[key]
                for key in [
                    "title",
                    "length",
                    "excerpt",
                    "byline",
                    "dir",
                    "lang",
                    "siteName",
                ]
            }

            if self._normalize is not None:
                article["textContent"] = self._normalize(article["textContent"])

            if self._text_splitter is not None:
                texts = self._text_splitter.split_text(article["textContent"])
            else:
                texts = [article["textContent"]]

            browser.close()

            return [Document(text=x, extra_info=extra_info) for x in texts]
    def scrape_page(
        self,
        browser: Any,
        url: str,
    ) -> Dict[str, Any]:
        """Scrape a single article url.

        Args:
            browser (Any): a Playwright Chromium browser.
            url (str): URL of the article to scrape.

        Returns:
            Dict[str, Any]: the parsed article, as returned by Readability.js
            (ref: https://github.com/mozilla/readability):
                title: article title;
                content: HTML string of the processed article content;
                textContent: text content of the article, with all the HTML tags removed;
                length: length of the article, in characters;
                excerpt: article description, or short excerpt from the content;
                byline: author metadata;
                dir: content direction;
                lang: content language;
                siteName: name of the site.
        """
        from playwright.sync_api import Browser

        # Load Readability.js lazily and cache it for subsequent calls.
        if self._readability_js is None:
            with open(path) as f:
                self._readability_js = f.read()

        # Inline Readability.js into the page and run it against the live DOM.
        # The documented mozilla/readability API is
        # `new Readability(document, options).parse()`.
        inject_readability = f"""
            (function(){{
                {self._readability_js}
                function executor() {{
                    return new Readability(document, {{}}).parse();
                }}
                return executor();
            }}())
        """
        browser = cast(Browser, browser)
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)  # 60-second timeout for navigation and evaluation
        page.goto(url, wait_until=self._wait_until)

        r = page.evaluate(inject_readability)

        page.close()
        print("scraped:", url)

        return r
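

# Runnable sketch, under a few assumptions: Playwright's Chromium is installed
# (`pip install playwright && playwright install chromium`), Readability.js
# sits next to this file, and the URL is illustrative. It pairs the reader
# with a SentenceSplitter to show the `text_splitter` hook.
if __name__ == "__main__":
    from llama_index.core.node_parser import SentenceSplitter

    reader = ReadabilityWebPageReader(
        wait_until="networkidle",
        text_splitter=SentenceSplitter(chunk_size=512),
    )
    for doc in reader.load_data("https://en.wikipedia.org/wiki/Readability"):
        print(doc.extra_info["title"], len(doc.text))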