| | import chalk from 'chalk' |
| |
|
| | import languages from '@/languages/lib/languages-server' |
| | import buildRecords from '@/search/scripts/scrape/lib/build-records' |
| | import findIndexablePages from '@/search/scripts/scrape/lib/find-indexable-pages' |
| | import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records' |
| | import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes' |
| |
|
| | import type { Options, Config, Page, Redirects } from '@/search/scripts/scrape/types' |
| |
|
| | |
| | |
/**
 * Builds search records for every selected language/version combination and
 * writes them to `outDirectory`, one records file per index.
 *
 * Steps, in order:
 *  1. Select the languages to build from the `language` / `notLanguage` options.
 *  2. Find the indexable pages once and derive a redirects map from them.
 *  3. For each language and each version, build the records and write them out.
 *  4. Print timing information and, if any page failed, write a
 *     `failures-summary.json` file into `outDirectory`.
 *
 * @param options.language - When set (and `notLanguage` is not), only this language is built.
 * @param options.notLanguage - When set, every language except this one is built; takes
 *   precedence over `language`.
 * @param options.outDirectory - Directory passed to `writeIndexRecords` and used for the
 *   failures summary file.
 * @param options.versionsToBuild - Index versions to iterate over for every language.
 * @param options.config - Passed through to `buildRecords`; its `filter` property is handed
 *   to `findIndexablePages`. Defaults to an empty object.
 * @returns A promise that resolves once all files have been written.
 */
export default async function scrapeIntoIndexJson({
  language,
  notLanguage,
  outDirectory,
  versionsToBuild,
  config = {} as Config,
}: Options): Promise<void> {
  const t0 = new Date()

  // `notLanguage` wins over `language`; with neither set, every known language is built.
  const languagesToBuild = Object.keys(languages).filter((lang) =>
    notLanguage ? notLanguage !== lang : language ? language === lang : true,
  )

  console.log(
    `Building indices for language: ${chalk.yellow(language || 'all languages')} and version: ${chalk.yellow(
      versionsToBuild.length === 1 ? versionsToBuild[0] : 'all versions',
    )}.\n`,
  )

  // The page list and the redirects map do not depend on language or version,
  // so they are computed once and shared by every buildRecords() call below.
  const indexablePages: Page[] = await findIndexablePages(config.filter)
  const redirects: Redirects = {}
  for (const page of indexablePages) {
    const href = relativePathToHref(page.relativePath)
    for (let redirectFrom of page.redirect_from || []) {
      // Keys are stored without a leading or trailing slash.
      if (redirectFrom.startsWith('/')) redirectFrom = redirectFrom.slice(1)
      if (redirectFrom.endsWith('/')) redirectFrom = redirectFrom.slice(0, -1)
      redirects[redirectFrom] = href
    }
  }

  let countRecordsTotal = 0
  let totalFailedPages = 0
  const allFailures: Array<{
    indexName: string
    languageCode: string
    indexVersion: string
    failures: Array<{ url?: string; relativePath?: string; error: string; errorType: string }>
  }> = []

  // Indices are built sequentially, one language/version pair at a time.
  for (const languageCode of languagesToBuild) {
    for (const indexVersion of versionsToBuild) {
      const { indexName } = getElasticSearchIndex('generalSearch', indexVersion, languageCode)

      const { records, failedPages } = await buildRecords(
        indexName,
        indexablePages,
        indexVersion,
        languageCode,
        redirects,
        config,
      )
      countRecordsTotal += records.length

      // Keep failures grouped per index so the summary shows where each one happened.
      if (failedPages.length > 0) {
        totalFailedPages += failedPages.length
        allFailures.push({
          indexName,
          languageCode,
          indexVersion,
          failures: failedPages,
        })
      }

      const fileWritten = await writeIndexRecords(indexName, records, outDirectory)
      console.log(`wrote records to ${fileWritten}`)
    }
  }
  const t1 = new Date()
  const tookSec = (t1.getTime() - t0.getTime()) / 1000

  console.log('\nDone!')
  console.log(`Took ${chalk.bold(formatSeconds(tookSec))}`)
  const rate = (countRecordsTotal / tookSec).toFixed(1)
  console.log(`Rate ~${chalk.bold(rate)} pages per second.`)

  // Only write a summary file when at least one page failed.
  if (totalFailedPages > 0) {
    const fs = await import('fs')
    const path = await import('path')
    const failuresSummaryPath = path.join(outDirectory, 'failures-summary.json')
    await fs.promises.writeFile(
      failuresSummaryPath,
      JSON.stringify(
        {
          totalFailedPages,
          failures: allFailures,
        },
        null,
        2,
      ),
    )
    console.log(`\n${chalk.yellow('⚠')} Wrote failures summary to ${failuresSummaryPath}`)
  }
}

/**
 * Converts a page's relative Markdown path into the href stored in the redirects map.
 *
 * Only the file suffix is stripped: a trailing `index.md` path segment is removed
 * (leaving any trailing slash in place), otherwise a trailing `.md` extension is
 * removed. Stripping the first occurrence of those substrings instead would mangle
 * paths such as `search-index.md`.
 */
function relativePathToHref(relativePath: string): string {
  const indexFile = 'index.md'
  const markdownExt = '.md'
  if (relativePath === indexFile || relativePath.endsWith(`/${indexFile}`)) {
    return relativePath.slice(0, -indexFile.length)
  }
  if (relativePath.endsWith(markdownExt)) {
    return relativePath.slice(0, -markdownExt.length)
  }
  return relativePath
}
| |
|
/**
 * Formats a duration in seconds as `HH:MM:SS`.
 *
 * Fractional seconds are truncated. Durations of 24 hours or more keep counting
 * hours (e.g. `25:00:00`) rather than wrapping back to `00`. Negative or
 * non-finite input is rendered as `00:00:00`.
 *
 * @param seconds - Duration in seconds.
 * @returns The zero-padded `HH:MM:SS` string.
 */
function formatSeconds(seconds: number): string {
  const totalSeconds = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0
  const hours = Math.floor(totalSeconds / 3600)
  const minutes = Math.floor((totalSeconds % 3600) / 60)
  const secs = totalSeconds % 60
  return [hours, minutes, secs].map((part) => String(part).padStart(2, '0')).join(':')
}
| |
|