| | import { render } from 'cheerio-to-text' |
| |
|
| | import type { Record } from '@/search/scripts/scrape/types' |
| |
|
| | |
| | |
| | |
| |
|
// Heading ids (slugs) that are left out when a page's section headings are collected.
const ignoredHeadingSlugs = [
  'in-this-article',
  'further-reading',
  'prerequisites',
]
| |
|
/**
 * Builds a single search record from a scraped page.
 *
 * The record is assembled from these parts of the document:
 * - `title`: trimmed text of the first `h1`.
 * - `breadcrumbs` / `toplevel`: text of the links inside
 *   `[data-search=breadcrumbs] nav.breadcrumbs`; when there is more than one
 *   link, the last one is left out of `breadcrumbs`.
 * - `headings`: text of every `h2` with an `id` inside the article body whose
 *   id is not listed in `ignoredHeadingSlugs`, joined with newlines.
 * - `intro`: text of the paragraphs inside `[data-search=lead]`.
 * - `content`: the rendered article body, prefixed with the intro unless the
 *   body already contains it.
 *
 * Side effects: every `[data-search=hide]` element is removed from the page's
 * document, and warnings are logged when the article body is missing,
 * duplicated, or when the page has neither body nor intro.
 *
 * @param page - Object exposing `href` and `$`. `$` is presumably a loaded
 *   Cheerio instance for the page (TODO confirm against the caller).
 * @returns The search record for the page; `objectID` is the page's `href`.
 */
export default function parsePageSectionsIntoRecords(page: any): Record {
  const { href, $ } = page

  // Title and breadcrumb links are read before the hidden elements are
  // stripped from the document below.
  const title = $('h1').first().text().trim()
  const crumbs = $('[data-search=breadcrumbs] nav.breadcrumbs a')
    .map((_index: number, anchor: any) =>
      // With a string pattern, replace() only drops the first '/' it finds.
      $(anchor).text().trim().replace('/', '').replace(/\s+/g, ' ')
    )
    .get()

  // Everything marked as hidden for search is dropped from the document, so
  // none of the lookups that follow can see it.
  $('[data-search=hide]').remove()

  // With two or more crumbs the last one is omitted; a single crumb is kept.
  const parentCrumbs = crumbs.length > 1 ? crumbs.slice(0, -1) : crumbs
  const breadcrumbs = parentCrumbs.join(' / ')
  const toplevel = crumbs[0] || ''

  const rootSelector = '[data-search=article-body]'
  const $articleBody = $(rootSelector)
  if ($articleBody.length !== 1) {
    // Exactly one article body is expected; anything else is only reported.
    console.warn(
      $articleBody.length === 0
        ? `${href} has no '${rootSelector}'`
        : `${href} has more than one '${rootSelector}' (${$articleBody.length})`
    )
  }

  const isIndexableHeading = (_index: number, heading: any) =>
    !ignoredHeadingSlugs.includes($(heading).attr('id'))
  const headings = $('h2', $articleBody)
    .filter('[id]')
    .filter(isIndexableHeading)
    .map((_index: number, heading: any) => $(heading).text())
    .get()
    .join('\n')
    .trim()

  const intro = $('[data-search=lead] p').text().trim()

  // Nothing is rendered when the article body is absent.
  const body = $articleBody.length > 0 ? render($articleBody) : ''

  if (!body && !intro) {
    console.warn(`${href} has no body and no intro.`)
  }

  // Put the intro in front of the body unless the body already includes it.
  let content = body.trim()
  if (intro && !body.includes(intro)) {
    content = `${intro}\n${content}`.trim()
  }

  return {
    objectID: href,
    breadcrumbs,
    title,
    headings,
    content,
    intro,
    toplevel,
  }
}
| |
|