github-docs-arabic-enhanced / src /languages /scripts /count-translation-corruptions.ts

Upload folder using huggingface_hub

88df9e4 verified 3 months ago

5.59 kB

	import path from 'path'
	import fs from 'fs'

	import { program } from 'commander'
	import chalk from 'chalk'
	import { TokenizationError } from 'liquidjs'
	import walk from 'walk-sync'

	import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils'
	import languages from '@/languages/lib/languages-server'
	import warmServer from '@/frame/lib/warm-server'
	import type { Site } from '@/types'
	import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content'

	program
	.description('Tally the number of liquid corruptions in a translation')
	.argument('[language...]', 'language(s) to compare against')
	.action(main)
	program.parse(process.argv)

	type Reusables = Map<string, string>

	async function main(languageCodes: string[]) {
	const langCodes = languageCodes.length
	? languageCodes
	: Object.keys(languages).filter((x) => x !== 'en')
	const site = await warmServer(languageCodes.length ? ['en', ...langCodes] : [])

	// When checking reusables, we only want to check the files that
	// have an English equivalent.
	const reusables = getReusables()

	const totalErrors = new Map<string, number>()

	for (const languageCode of langCodes) {
	if (!(languageCode in languages)) {
	console.error(chalk.red(`Language ${languageCode} not found`))
	return process.exit(1)
	}
	if (languageCode === 'en') {
	console.error(chalk.red("Can't test in English ('en')"))
	return process.exit(1)
	}
	const { errors } = run(languageCode, site, reusables)
	for (const [error, count] of Array.from(errors.entries())) {
	totalErrors.set(error, (totalErrors.get(error) \|\| 0) + count)
	}
	}

	const sumTotal = Array.from(totalErrors.values()).reduce((acc, count) => acc + count, 0)
	console.log('\nGRAND TOTAL ERRORS:', sumTotal)
	}

	function getReusables(): Reusables {
	const reusables = new Map()
	const files = walk('data/reusables', {
	includeBasePath: true,
	globs: ['*/.md'],
	ignore: ['**/README.md'],
	})
	for (const file of files) {
	const content = fs.readFileSync(file, 'utf8')
	reusables.set(file, content)
	}
	return reusables
	}

	function run(languageCode: string, site: Site, englishReusables: Reusables) {
	const PADDING = 60
	const language = languages[languageCode as keyof typeof languages]

	console.log(`--- Tallying liquid corruptions in ${languageCode} (${language.name}) ---`)

	const pageList = site.pageList
	const errors = new Map<string, number>()
	const wheres = new Map<string, number>()
	const illegalTags = new Map<string, number>()

	function countError(error: TokenizationError, where: string) {
	const originalError = (error as any).originalError
	const errorString = originalError ? originalError.message : error.message
	if (errorString.includes('illegal tag syntax')) {
	const illegalTag = (error as any).token.content
	illegalTags.set(illegalTag, (illegalTags.get(illegalTag) \|\| 0) + 1)
	}
	errors.set(errorString, (errors.get(errorString) \|\| 0) + 1)
	wheres.set(where, (wheres.get(where) \|\| 0) + 1)
	}

	for (const page of pageList) {
	if (page.languageCode !== languageCode) continue

	const strings: string[][] = [
	['title', page.title],
	['shortTitle', page.shortTitle \|\| ''],
	['intro', page.intro \|\| ''],
	['markdown', page.markdown],
	].filter(([, string]) => Boolean(string))

	for (const [where, string] of strings) {
	try {
	getLiquidTokens(string)
	} catch (error) {
	if (error instanceof TokenizationError) {
	countError(error, where)
	} else {
	throw error
	}
	}
	}
	}

	for (const [relativePath, englishContent] of Array.from(englishReusables.entries())) {
	try {
	const filePath = path.join(language.dir, relativePath)
	const rawContent = fs.readFileSync(filePath, 'utf8')
	const correctedContent = correctTranslatedContentStrings(rawContent, englishContent, {
	code: languageCode,
	relativePath,
	})
	getLiquidTokens(correctedContent)
	} catch (error) {
	if (error instanceof TokenizationError) {
	countError(error, 'reusable')
	} else if (error instanceof Error && error.message.startsWith('ENOENT')) {
	continue
	} else {
	throw error
	}
	}
	}

	const flat = Array.from(errors.entries()).sort((a, b) => b[1] - a[1])
	const sumTotal = flat.reduce((acc, [, count]) => acc + count, 0)

	console.log('\nMost common errors')
	for (let i = 0; i < flat.length; i++) {
	const [error, count] = flat[i]
	console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
	}
	console.log(`${'TOTAL:'.padEnd(3 + 1 + PADDING)}`, sumTotal)

	if (sumTotal) {
	const whereFlat = Array.from(wheres.entries()).sort((a, b) => b[1] - a[1])
	console.log('\nMost common places')
	for (let i = 0; i < whereFlat.length; i++) {
	const [error, count] = whereFlat[i]
	console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
	}

	const illegalTagsFlat = Array.from(illegalTags.entries()).sort((a, b) => b[1] - a[1])
	if (illegalTagsFlat.reduce((acc, [, count]) => acc + count, 0)) {
	console.log('\nMost common illegal tags', illegalTagsFlat.length > 10 ? ' (Top 10)' : '')
	const topIllegalTags = illegalTagsFlat.slice(0, 10)
	for (let i = 0; i < topIllegalTags.length; i++) {
	const [error, count] = topIllegalTags[i]
	console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
	}
	}
	}
	console.log('\n')

	return { errors }
	}