import { render } from 'cheerio-to-text' import type { Record } from '@/search/scripts/scrape/types' // This module takes cheerio page object and divides it into sections // using H1,H2 heading elements as section delimiters. The text // that follows each heading becomes the content of the search record. const ignoredHeadingSlugs = ['in-this-article', 'further-reading', 'prerequisites'] export default function parsePageSectionsIntoRecords(page: any): Record { const { href, $ } = page const title = $('h1').first().text().trim() const breadcrumbsArray = $('[data-search=breadcrumbs] nav.breadcrumbs a') .map((i: number, el: any) => { return $(el).text().trim().replace('/', '').replace(/\s+/g, ' ') }) .get() // Like in printing from DOM, some elements should not be included in // the records for search. This might be navigational elements of the // page that don't make much sense to find in a site search. $('[data-search=hide]').remove() // Only slice off the last one if the length of the array is greater than 1 // On an article page, we the breadcrumbs array will be something // like: // // ['Product short title', 'Subcategory', 'Article title'] // // But on a product landing page, it'll just be: // // ['Product short title'] // // So here, if we skip the last one we get nothing for the breadcrumb. const breadcrumbs = breadcrumbsArray .slice(0, breadcrumbsArray.length > 1 ? -1 : breadcrumbsArray.length) .join(' / ') || '' const toplevel = breadcrumbsArray[0] || '' const objectID = href const rootSelector = '[data-search=article-body]' const $root = $(rootSelector) if ($root.length === 0) { console.warn(`${href} has no '${rootSelector}'`) } else if ($root.length > 1) { console.warn(`${href} has more than one '${rootSelector}' (${$root.length})`) } const $sections = $('h2', $root) .filter('[id]') .filter((i: number, el: any) => { return !ignoredHeadingSlugs.includes($(el).attr('id')) }) const headings = $sections .map((i: number, el: any) => $(el).text()) .get() .join('\n') .trim() const intro = $('[data-search=lead] p').text().trim() let body = '' // Typical example pages with no `$root` are: // https://docs.github.com/en/code-security/guides // // We need to avoid these because if you use `getAllText()` on these // pages, it will extract *everything* from the page, which will // include the side bar and footer. // TODO: Come up a custom solution to extract some text from these // pages that yields some decent content to be searched on, because // when you view these pages in a browser, there's clearly text there. if ($root.length > 0) { body = render($root) } if (!body && !intro) { console.warn(`${objectID} has no body and no intro.`) } const content = intro && !body.includes(intro.trim()) ? `${intro.trim()}\n${body.trim()}`.trim() : body.trim() return { objectID, breadcrumbs, title, headings, content, intro, toplevel, } }