/**
 *
 * This script will loop over all pages, in all languages, and look at
 * the following:
 *
 *   1. `title` in frontmatter
 *   2. `intro` in frontmatter
 *   3. `shortTitle` in frontmatter (if present)
 *   4. the markdown body itself
 *   5. The `versions:` frontmatter key (if the page is in English)
 *
 * Then it will search out the features mentioned based on `data/features/*.yml`
 * It will make a Set of these (e.g. `dependabot-grouped-dependencies` and
 * `ghas-enablement-webhook`) and one by one pluck them away.
 *
 * After the pages, it will loop over the reusables in English, and do the
 * same search there. Once it's done the English, it loops over the
 * reusables in the translations (if they exist) and does the same search.
 *
 * Lastly, it will output the remaining features, as relative file paths.
 * For example, `data/features/havent-been-used-in-years.yml` so now you
 * know that file can be deleted.
 *
 * NOTE: A lot of translations have corrupted Liquid. So if we can't parse
 * the Liquid we fall back to string search. A regex will try to find
 * all `{% ifversion ... %}` (and `elsif`) and search for any features
 * mentioned inside that as a string.
 *
 */

import { strictEqual } from 'node:assert'
import fs from 'fs'
import path from 'path'

import chalk from 'chalk'
import { TokenizationError, TokenKind } from 'liquidjs'
import type { TagToken } from 'liquidjs'

import type { Page } from '@/types'
import warmServer from '@/frame/lib/warm-server'
import { getDeepDataByLanguage } from '@/data-directory/lib/get-data'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils'
import languages from '@/languages/lib/languages-server'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content'

/** Feature names that are never reported as orphans. */
const EXCEPTIONS = new Set([
  // From data/features/placeholder.yml. Used by tests.
  'placeholder',
])

type Options = {
  /** Directory that is prefixed to `<feature>.yml` when reporting orphans. */
  sourceDirectory: string
  /** Optional file to write the orphan list to. */
  output?: string
  /** Print progress and always print the orphan list. */
  verbose?: boolean
}

/**
 * Find feature names that are not referenced by any page, variable file or
 * reusable, and report them as `<sourceDirectory>/<feature>.yml` paths.
 *
 * When `options.output` is set the list is written to that file: as a JSON
 * array if the name ends in `.json`, otherwise as newline-separated paths.
 * After writing, nothing is printed unless `options.verbose` is set.
 *
 * @param options See {@link Options}.
 */
export async function find(options: Options): Promise<void> {
  const { sourceDirectory } = options
  if (process.env.ENABLED_LANGUAGES === 'en') {
    console.warn(
      chalk.yellow(
        `Only English is enabled. Be careful with the output.
        To include all translations make sure they're available and that
        ENABLED_LANGUAGES is not set or set to 'all'.`.replaceAll(/\s\s+/g, ' '),
      ),
    )
  }
  const site = await warmServer([])

  // Start with every known feature (minus the exceptions); the search below
  // removes each one that is found in use.
  const features = new Set<string>(
    Object.keys(getDeepDataByLanguage('features', 'en')).filter((f) => !EXCEPTIONS.has(f)),
  )
  if (options.verbose) {
    console.log(`Found ${features.size} features`)
  }

  const pageList: Page[] = site.pageList
  if (options.verbose) {
    console.log(`Searching ${pageList.length.toLocaleString()} pages`)
  }

  const t0 = new Date()
  searchAndRemove(features, pageList, Boolean(options.verbose))
  const t1 = new Date()

  if (options.verbose) {
    const color = features.size === 0 ? chalk.green : chalk.yellow
    console.log(
      color(
        // Collapse every run of whitespace (global), the same way the warning
        // above does, so the message stays on one line however it is wrapped.
        `Searched ${pageList.length.toLocaleString()} pages in ${formatDelta(t0, t1)}.
        And found ${features.size} features remaining (i.e. orphans).`.replaceAll(/\s\s+/g, ' '),
      ),
    )
  }

  const remaining = Array.from(features).map((feature) =>
    path.join(sourceDirectory, `${feature}.yml`),
  )

  if (options.output) {
    if (options.output.endsWith('.json')) {
      // NOTE(review): no JSON file is written when there are no orphans;
      // presumably a consumer checks for the file's existence. Confirm.
      if (remaining.length) {
        fs.writeFileSync(options.output, JSON.stringify(remaining, null, 2))
      }
    } else {
      fs.writeFileSync(options.output, remaining.join('\n'))
    }
    if (!options.verbose) {
      return
    }
  }

  console.log(chalk.bold(`Orphans found (${remaining.length}):`))
  for (const feature of remaining) {
    console.log(chalk.green(feature))
  }
}

/** Format the time between two dates as seconds with one decimal. */
function formatDelta(t0: Date, t1: Date): string {
  const ms = t1.getTime() - t0.getTime()
  return `${(ms / 1000).toFixed(1)} seconds`
}

/**
 * Remove from `features` (in place) every feature that is referenced by the
 * given pages, by the English variable files, or by the reusables in any
 * language.
 *
 * @param features Candidate orphan features; mutated.
 * @param pages Pages whose body, title, short title and intro are searched.
 * @param verbose Passed on to `checkString` for its diagnostics.
 */
function searchAndRemove(features: Set<string>, pages: Page[], verbose = false): void {
  for (const page of pages) {
    const content = page.markdown

    // We actually never bother looking at the `versions:` frontmatter
    // key in translations, so it doesn't matter if the translated
    // frontmatter might have `versions: some-old-feature`.
    if (page.languageCode === 'en') {
      for (const [key, value] of Object.entries(page.versions)) {
        // Only a plain string can name a feature in the set.
        if (key === 'feature' && typeof value === 'string') {
          features.delete(value)
        }
      }
    }

    const combined = `
      ${content}
      ${page.title || ''}
      ${page.shortTitle || ''}
      ${page.intro || ''}
    `
    checkString(combined, features, { page, verbose, languageCode: page.languageCode })
  }

  // Reusables are a bit special, as they are shared between languages.
  // There'll always be a slight mismatch between files present on disk
  // in English vs. translations.
  // The translations never delete files, so there's often excess reusables
  // on disk in translations. And the English might be ahead, meaning a file
  // has been introduced in English but not yet translated.
  // The code below loops over the English reusables, and takes note of
  // their relative paths and content. Then, we re-use the keys of that map
  // to know which files, in the translations, to check. And when we read
  // them in, we'll need the English equivalent content to be able to
  // use the correctTranslatedContentStrings function.

  // Check variables files
  for (const filePath of getVariableFiles(path.join(languages.en.dir, 'data', 'variables'))) {
    const fileContent = fs.readFileSync(filePath, 'utf-8')
    checkString(fileContent, features, { filePath, verbose, languageCode: 'en' })
  }

  // Relative path (from the English root) -> English file content.
  const englishReusables = new Map<string, string>()
  for (const filePath of getReusableFiles(path.join(languages.en.dir, 'data', 'reusables'))) {
    const relativePath = path.relative(languages.en.dir, filePath)
    const fileContent = fs.readFileSync(filePath, 'utf-8')
    checkString(fileContent, features, { filePath, verbose, languageCode: 'en' })
    englishReusables.set(relativePath, fileContent)
  }

  for (const language of Object.values(languages)) {
    if (language.code === 'en') continue // Already did that in the loop above
    for (const [relativePath, englishFileContent] of Array.from(englishReusables.entries())) {
      const filePath = path.join(language.dir, relativePath)
      try {
        const fileContent = fs.readFileSync(filePath, 'utf-8')
        const correctedFileContent = correctTranslatedContentStrings(
          fileContent,
          englishFileContent,
          {
            code: language.code,
            relativePath,
          },
        )
        checkString(correctedFileContent, features, {
          filePath,
          verbose,
          languageCode: language.code,
        })
      } catch (error) {
        if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
          // That a reusable does *not* exist in a translation is
          // perfectly expected. It means that English reusable was
          // most likely added recently and the translation hasn't been
          // translated yet.
          continue
        }
        throw error
      }
    }
  }
}

/**
 * Recursively list files under `root` whose name ends with `extension`,
 * skipping any file named exactly `excluded`. Paths are built as
 * `${root}/${file}`.
 */
function collectFiles(root: string, extension: string, excluded: string): string[] {
  const found: string[] = []
  for (const file of fs.readdirSync(root)) {
    const filePath = `${root}/${file}`
    if (fs.statSync(filePath).isDirectory()) {
      found.push(...collectFiles(filePath, extension, excluded))
    } else if (file.endsWith(extension) && file !== excluded) {
      found.push(filePath)
    }
  }
  return found
}

/**
 * Recursively list the `.md` files under `root`, except `README.md`.
 *
 * @param root Directory to walk.
 * @returns File paths, each prefixed with `root`.
 */
export function getReusableFiles(root: string): string[] {
  return collectFiles(root, '.md', 'README.md')
}

/**
 * Recursively list the `.yml` files under `root`, except `README.yml`.
 *
 * @param root Directory to walk.
 * @returns File paths, each prefixed with `root`.
 */
export function getVariableFiles(root: string): string[] {
  return collectFiles(root, '.yml', 'README.yml')
}

/** `ifversion`/`elsif` arguments that can never be feature names. */
const IGNORE_ARGS = new Set(['or', 'and', 'not', '<', '>', 'ghes', 'fpt', 'ghec', '!=', '='])

/**
 * Remove from `features` (in place) every feature that appears as an
 * argument of an `ifversion` or `elsif` Liquid tag in `string`.
 *
 * If the Liquid can't be tokenized, English content rethrows the error;
 * any other language falls back to the regex search in `findByRegex`.
 *
 * @param string Content to search.
 * @param features Candidate orphan features; mutated.
 * @param context `page` or `filePath` is only used to name the source in the
 *   verbose message; `languageCode` decides whether a tokenization error is
 *   fatal.
 * @throws TokenizationError when `languageCode` is `'en'`, plus any error
 *   that is not a TokenizationError.
 */
function checkString(
  string: string,
  features: Set<string>,
  {
    page,
    filePath,
    languageCode,
    verbose = false,
  }: { page?: Page; filePath?: string; languageCode?: string; verbose?: boolean } = {},
): void {
  try {
    // The reason for the `noCache: true` is that we're going to be sending
    // a LOT of different strings in and the cache will fill up rapidly
    // when testing every possible string in every possible language for
    // every page.
    const tokens = getLiquidTokens(string, { noCache: true }).filter(
      (token): token is TagToken => token.kind === TokenKind.Tag,
    )
    for (const token of tokens) {
      if (token.name !== 'ifversion' && token.name !== 'elsif') continue
      for (const arg of token.args.split(/\s+/)) {
        if (IGNORE_ARGS.has(arg)) continue
        if (isFloat(arg)) continue
        // `delete` is a no-op for anything that isn't a known feature.
        features.delete(arg)
      }
    }
  } catch (error) {
    if (error instanceof TokenizationError) {
      // If it happens in English, it's a serious error
      if (languageCode === 'en') throw error
      // The translation might, currently, have corrupted liquid
      // So treat it as a string
      if (verbose)
        console.log(
          `TokenizationError in ${page ? page.fullPath : filePath}. Treating ${page ? page.fullPath : filePath} as a string and using regex`,
        )

      for (const feature of Array.from(findByRegex(features, string))) {
        features.delete(feature)
      }
    } else {
      throw error
    }
  }
}

/**
 * String-based fallback for content whose Liquid can't be tokenized.
 *
 * Finds every `{% ifversion ... %}` and `{% elsif ... %}` tag and returns the
 * features that occur inside one as a whole word: preceded by whitespace and
 * followed by whitespace or `%`, matched case-insensitively.
 *
 * @param features Features to look for; not mutated.
 * @param string Content to search.
 * @returns The subset of `features` that was found.
 */
function findByRegex(features: Set<string>, string: string): Set<string> {
  const found = new Set<string>()
  const tags = string.match(/\{%\s*(ifversion|elsif)\s*(.*?)\s*%\}/g)
  if (!tags) return found

  // Compile one pattern per feature up front rather than once per tag.
  const patterns = Array.from(features).map((feature): [string, RegExp] => [
    feature,
    new RegExp(`\\s${escapeRegex(feature)}(\\s|%)`, 'i'),
  ])
  for (const tag of tags) {
    for (const [feature, pattern] of patterns) {
      if (!found.has(feature) && pattern.test(tag)) {
        found.add(feature)
      }
    }
  }
  return found
}

// Self-check that runs when the module is loaded: only whole-word mentions
// inside `ifversion`/`elsif` tags count.
const test = findByRegex(
  new Set(['placeholder', 'foo-bar']),
  `
  placeholder
  {%ifversion placeholder-foo or fpt%}
  {% elsif not-placeholder %}
  {% elsif foo-bar%}
  {%endif %}
  {% data reusables.enterprise-migration-tool.placeholder-table %}
  {% data placeholder %}
`,
)
// List the members on failure; `Set.prototype.toString()` only gives
// `[object Set]`.
console.assert(test.has('foo-bar'), Array.from(test).join(', '))
console.assert(!test.has('placeholder'), Array.from(test).join(', '))

/** Escape the regex metacharacters in `string` so it matches literally. */
function escapeRegex(string: string): string {
  return string.replace(/[/\-\\^$*+?.()|[\]{}]/g, '\\$&')
}

/**
 * Whether `Number(x)` yields something other than NaN.
 *
 * Checked with `Number.isNaN` instead of the truthiness of `Number(x) + 1`,
 * which wrongly rejected `-1` because `-1 + 1` is `0`.
 */
function isFloat(x: unknown): boolean {
  return !Number.isNaN(Number(x))
}
strictEqual(isFloat('1.2'), true)
strictEqual(isFloat('10'), true)
strictEqual(isFloat('-1'), true)
strictEqual(isFloat('notatall'), false)
strictEqual(isFloat('2fa'), false)