Spaces:
Sleeping
Sleeping
| from typing import Any, Dict, List, Optional | |
| from llama_index.core.readers.base import BaseReader | |
| from llama_index.core.schema import Document | |
class KnowledgeBaseWebReader(BaseReader):
    """Knowledge base reader.

    Crawls and reads articles from a knowledge base/help center with Playwright.
    Tested on Zendesk and Intercom CMS, may work on others.
    Can be run in headless mode but it may be blocked by Cloudflare. Run it headed to be safe.
    Times out occasionally, just increase the default time out if it does.
    Requires the `playwright` package.

    Args:
        root_url (str): the base url of the knowledge base, with no trailing slash
            e.g. 'https://support.intercom.com'
        link_selectors (List[str]): list of css selectors to find links to articles while crawling
            e.g. ['.article-list a', '.article-list a']
        article_path (str): the url path of articles on this domain so the crawler knows when to stop
            e.g. '/articles'
        title_selector (Optional[str]): css selector to find the title of the article
            e.g. '.article-title'
        subtitle_selector (Optional[str]): css selector to find the subtitle/description of the article
            e.g. '.article-subtitle'
        body_selector (Optional[str]): css selector to find the body of the article
            e.g. '.article-body'
    """

    def __init__(
        self,
        root_url: str,
        link_selectors: List[str],
        article_path: str,
        title_selector: Optional[str] = None,
        subtitle_selector: Optional[str] = None,
        body_selector: Optional[str] = None,
    ) -> None:
        """Initialize with parameters."""
        self.root_url = root_url
        self.link_selectors = link_selectors
        self.article_path = article_path
        self.title_selector = title_selector
        self.subtitle_selector = subtitle_selector
        self.body_selector = body_selector

    def load_data(self) -> List[Document]:
        """Load data from the knowledge base.

        Crawls from ``self.root_url``, scrapes every discovered article, and
        wraps each one in a ``Document`` whose ``extra_info`` carries the
        article's title, subtitle and url.

        Returns:
            List[Document]: one Document per scraped article.
        """
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # Headed mode is deliberate: headless Chromium is more likely to
            # be blocked by Cloudflare (see class docstring).
            browser = p.chromium.launch(headless=False)
            try:
                # Crawl the knowledge base to collect article urls.
                article_urls = self.get_article_urls(
                    browser,
                    self.root_url,
                    self.root_url,
                )

                # Scrape each article page into a Document.
                documents = []
                for url in article_urls:
                    article = self.scrape_article(browser, url)
                    extra_info = {
                        "title": article["title"],
                        "subtitle": article["subtitle"],
                        "url": article["url"],
                    }
                    documents.append(
                        Document(text=article["body"], extra_info=extra_info)
                    )
            finally:
                # Always release the browser, even if crawling/scraping raises.
                browser.close()

        return documents

    def _extract_text(self, page: Any, selector: Optional[str]) -> str:
        """Return the innerText of the first node matching ``selector``.

        Returns an empty string when ``selector`` is falsy or matches no node,
        instead of raising ``AttributeError`` on the ``None`` query result.

        Args:
            page (Any): a Playwright page with the article loaded.
            selector (Optional[str]): css selector to query, may be None.

        Returns:
            str: the matched node's innerText, or "" if nothing matched.
        """
        if not selector:
            return ""
        node = page.query_selector(selector)
        if node is None:
            # Selector configured but absent on this page (e.g. an article
            # with no subtitle) — treat as empty rather than crashing.
            return ""
        return node.evaluate("node => node.innerText")

    def scrape_article(
        self,
        browser: Any,
        url: str,
    ) -> Dict[str, str]:
        """Scrape a single article url.

        Args:
            browser (Any): a Playwright Chromium browser.
            url (str): URL of the article to scrape.

        Returns:
            Dict[str, str]: a mapping of article attributes to their values.
        """
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        try:
            page.goto(url, wait_until="domcontentloaded")
            title = self._extract_text(page, self.title_selector)
            subtitle = self._extract_text(page, self.subtitle_selector)
            body = self._extract_text(page, self.body_selector)
        finally:
            # Close the tab even when navigation or extraction fails, so a
            # long crawl does not accumulate leaked pages.
            page.close()
        print("scraped:", url)
        return {"title": title, "subtitle": subtitle, "body": body, "url": url}

    def get_article_urls(
        self,
        browser: Any,
        root_url: str,
        current_url: str,
        visited: Optional[set] = None,
    ) -> List[str]:
        """Recursively crawl through the knowledge base to find a list of articles.

        Args:
            browser (Any): a Playwright Chromium browser.
            root_url (str): root URL of the knowledge base.
            current_url (str): current URL that is being crawled.
            visited (Optional[set]): urls already crawled. Used internally by
                the recursion to avoid revisiting pages and looping forever on
                cyclic link structures (breadcrumbs, "related articles", etc.).
                Callers may omit it.

        Returns:
            List[str]: a list of URLs of found articles.
        """
        if visited is None:
            visited = set()
        if current_url in visited:
            # Already crawled (or currently being crawled) — break the cycle.
            return []
        visited.add(current_url)

        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        try:
            page.goto(current_url, wait_until="domcontentloaded")

            # If this is a leaf node aka article page, return itself.
            if self.article_path in current_url:
                print("Found an article: ", current_url)
                return [current_url]

            # Otherwise crawl this page and find all the articles linked from it.
            links = []
            for link_selector in self.link_selectors:
                links.extend(page.query_selector_all(link_selector))

            article_urls = []
            for link in links:
                href = page.evaluate("(node) => node.getAttribute('href')", link)
                if not href:
                    # Anchor with no href attribute: getAttribute returns None,
                    # which would make `root_url + href` raise TypeError.
                    continue
                article_urls.extend(
                    self.get_article_urls(browser, root_url, root_url + href, visited)
                )
        finally:
            page.close()
        return article_urls