github-docs-arabic-enhanced / src /languages /lib /correct-translation-content.ts

Upload folder using huggingface_hub

88df9e4 verified about 1 month ago

18.2 kB

	/**
	* A lot of translations have minor corruptions that will lead to rendering
	* failing (and having to rely on English fallback). Many of these are
	* easy to manually correct for.
	*
	* This function is a temporary solution to correct for these corruptions.
	* It looks for easy "low hanging fruit" that we can correct for.
	*
	*/

	/**
	 * Optional hints describing where a translated string came from.
	 * All named fields are optional; arbitrary extra keys are accepted so
	 * callers can pass a larger object without trimming it first.
	 */
	interface CorrectionContext {
	  // Any additional properties are tolerated and left untyped.
	  [key: string]: any
	  /** Path of the file the content was read from, e.g. ending in `.md`. */
	  relativePath?: string
	  /** Dot-separated identifier of the content, e.g. `reusables.foo.bar`. */
	  dottedPath?: string
	  /** Language code of the translation, e.g. `ru`, `ja`, `ko`. */
	  code?: string
	}

	/**
	 * Applies a series of targeted string and regex repairs to a translated
	 * content string so that common, mechanical translation corruptions
	 * (broken Liquid tags, lost line breaks in Markdown tables, mangled
	 * AUTOTITLE links, ...) are undone.
	 *
	 * The repairs run in a fixed order; some later, language-agnostic fixes
	 * rely on the per-language fixes having run first.
	 *
	 * @param content - The translated text to repair.
	 * @param englishContent - The English text, used as a reference to decide
	 *   whether a line break should be re-inserted.
	 * @param context - Optional hints. `code` selects the per-language fixes,
	 *   `dottedPath` and `relativePath` select page-specific fixes.
	 * @returns The repaired content. If a `{{%` sequence is still present
	 *   afterwards, the offending lines are logged with `console.log`.
	 */
	export function correctTranslatedContentStrings(
	  content: string,
	  englishContent: string,
	  context: CorrectionContext = {},
	): string {
	  // A lot of translations have corruptions around the AUTOTITLE links.
	  // We've requested that these are corrected back but as a temporary
	  // solution we'll manually recover now.
	  // See internal issue #2762
	  // In late 2023, search in the translations repos if these things are
	  // still happening and if not, the following lines can be removed.
	  content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
	  content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
	  content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
	  content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')
	  content = content.replaceAll('[ AUTOTITLE](', '[AUTOTITLE](')
	  content = content.replaceAll('[ "AUTOTITLE](', '[AUTOTITLE](')
	  content = content.replaceAll('[«AUTOTITLE»](', '[AUTOTITLE](')

	  // We've seen a lot of these across different languages.
	  content = content.replaceAll('{{% octicon', '{% octicon')
	  content = content.replaceAll('{{%octicon', '{% octicon')
	  content = content.replaceAll('{{% endif %}', '{% endif %}')
	  content = content.replaceAll('{{%endif %}', '{% endif %}')

	  // For a short while we injected `replacedomain` into code snippets
	  // to activate the Domain Edit functionality. That was in `main` for a
	  // while and was later removed in English. But during that window of
	  // time, some translations picked it up. Let's remove it. For now.
	  // The day we re-instate editable domain, delete these lines.
	  if (content.includes('replacedomain')) {
	    content = content.replaceAll('```text replacedomain copy', '```text copy')
	    content = content.replaceAll('```shell replacedomain', '```shell')
	  }

	  if (context.code === 'ru') {
	    // Low-hanging fruit for the data tag
	    content = content.replaceAll('{% данных variables', '{% data variables')
	    content = content.replaceAll('{% данными variables', '{% data variables')
	    content = content.replaceAll('{% данных организации variables', '{% data variables')
	    content = content.replaceAll('{% данным variables.', '{% data variables.')
	    content = content.replaceAll('{% данные variables.', '{% data variables.')
	    content = content.replaceAll('{% данных reusables', '{% data reusables')
	    content = content.replaceAll('{% данными reusables', '{% data reusables')
	    content = content.replaceAll('{% variables.', '{% data variables.')
	    content = content.replaceAll('{% необработанного %}', '{% raw %}')
	    content = content.replaceAll('{%- ifversion fpt или ghec %}', '{%- ifversion fpt or ghec %}')
	    content = content.replaceAll('{% ifversion fpt или ghec %}', '{% ifversion fpt or ghec %}')
	    content = content.replaceAll('{% endif _%}', '{% endif %}')
	    content = content.replaceAll('{% конечным %}', '{% endif %}')
	    content = content.replaceAll('{% переменных данных.', '{% data variables.')
	    content = content.replaceAll('{% повторно используемых данных.', '{% data reusables.')
	    content = content.replaceAll('{% примечание %}', '{% note %}')
	    content = content.replaceAll('{% конечных головщиков %}', '{% endrowheaders %}')
	    content = content.replaceAll('{% данных для повторного использования.', '{% data reusables.')
	    content = content.replaceAll('{% еще %}', '{% else %}')
	    content = content.replaceAll('{% необработанные %}', '{% raw %}')
	    content = content.replaceAll('{% подсказки %}', '{% tip %}')

	    // Fix YAML quote issues in UI files. Specifically the disclaimer href attribute
	    // href="...}> -> href="...">
	    content = content.replace(/href="([^"]*)}>/g, 'href="$1">')

	    // Fix double quotes in Russian YAML files that cause parsing errors
	    // ""https:// -> "https://
	    content = content.replace(/href=""https:\/\//g, 'href="https://')

	    // Fix empty HTML tags that cause YAML parsing issues
	    content = content.replaceAll('<b></b>', '')
	    content = content.replaceAll('<u></u>', '')

	    // Fix specific Russian UI YAML issues causing 502 errors
	    // Remove empty bold tags from early_access notice.
	    // The blanket removal above handles the common case; this only
	    // catches an empty tag still present after that single pass.
	    // The quantifiers allow any amount of whitespace after the key and
	    // any amount of text before the empty tag.
	    content = content.replace(/early_access:\s*"([^"]*)<b><\/b>([^"]*)"/, 'early_access: "$1$2"')

	    // Remove empty underline tags from privacy disclaimer
	    content = content.replace(/(privacy_disclaimer:[^<]*)<u><\/u>/g, '$1')

	    // For the rather custom Russian translation of
	    // the content/get-started/learning-about-github/github-glossary.md page
	    // These string replacements speak for themselves.
	    content = content.replaceAll(
	      '{% для глоссария в глоссариях %}',
	      '{% for glossary in glossaries %}',
	    )
	    content = content.replaceAll('{{ глоссарий.term }}', '{{ glossary.term }}')
	    content = content.replaceAll('{{ глоссарий.description }}', '{{ glossary.description }}')
	  }

	  if (context.code === 'ja') {
	    // Low-hanging fruit for the data tag
	    content = content.replaceAll('{% データ variables', '{% data variables')
	    content = content.replaceAll('{% データvariables', '{% data variables')

	    // Fix specific issue likely causing 502 errors
	    // Remove trailing quote from the problematic translation.
	    // Whitespace between the parts is optional (`\s*`) so the fix applies
	    // regardless of how the sentence was spaced.
	    content = content.replace(
	      /asked_too_many_times:\s*申し訳ありません。短い時間に質問が多すぎます。\s*しばらく待ってからもう一度やり直してください。"\s*$/gm,
	      'asked_too_many_times: 申し訳ありません。短い時間に質問が多すぎます。しばらく待ってからもう一度やり直してください。',
	    )

	    // Internal issue #4160
	    content = content.replaceAll(
	      '- % data variables.product.prodname_copilot_enterprise %}',
	      '- {% data variables.copilot.copilot_enterprise %}',
	    )

	    // This might not be exclusive to Japanese but put here because, at
	    // the time of writing, it only happens on the Japanese translations.
	    // According to the Microsoft translation guidelines, they're not
	    // supposed to translate words that will be seen in the UI, but
	    // instead mention them like this:
	    //
	    //   [Save changes](THE TRANSLATION OF "Save changes" IN JAPANESE)
	    //
	    // The problem is when these are wrapped in a deliberate Markdown link.
	    // For example:
	    //
	    //   [[Save changes](THE TRANSLATION OF "Save changes" IN JAPANESE)](#some-section)
	    //
	    // A real observed example is:
	    //
	    //   [[Allow deletions](削除を許可)](#allow-deletions)
	    //
	    // Here, because "削除を許可" contains no spaces, the Markdown parser
	    // thinks "削除を許可" is the URL! But in actuality,
	    // `[Allow deletions](削除を許可)` is the text and `#allow-deletions`
	    // is the URL.
	    // This problem does not exhibit if the text "削除を許可" were to contain
	    // a space character. But we can't assume that we can just add a space,
	    // because there is no generally correct position to put it.
	    // Instead, we can inject a "hair space" whitespace character between
	    // the `]` and the `(`. Then, the Markdown processor does not get confused
	    // and the link is rendered correctly.
	    // The `\u200A` is the "hair space" character. Technically whitespace
	    // but not wide enough to visually appear as a space.
	    //
	    // Group 1 is the inner `[text]`; group 2 is the literal
	    // `(translation)](` that follows it. The parentheses must be escaped
	    // so they match literally.
	    content = content.replace(/\[(\[.*?\])(\(\S+\)\]\()/g, '[$1\u200A$2')
	  }

	  if (context.code === 'zh') {
	    // Low-hanging fruit for the data tag
	    content = content.replaceAll('{% 数据variables', '{% data variables')
	  }

	  if (context.code === 'ko') {
	    // Low-hanging fruit for the data tag
	    content = content.replaceAll('{% 데이터 variables', '{% data variables')
	    content = content.replaceAll('{% 데이터 reusables.', '{% data reusables.')

	    // For the rather custom Korean translation of github-glossary.md
	    // Let's try to salvage based on what's in
	    // docs-internal.ko-kr/content/get-started/learning-about-github/github-glossary.md
	    // as of September 2023.
	    content = content.replaceAll('용어집 %}의 용어집에 대한 {%', '{% for glossary in glossaries %}')
	    content = content.replaceAll('{{ 용어집.term }}', '{{ glossary.term }}')
	    content = content.replaceAll('{{ 용어집.description }}', '{{ glossary.description }}')
	  }

	  if (context.code === 'es') {
	    // Seen these a few times in the Spanish translations.
	    content = content.replaceAll('{% vulnerables variables.', '{% data variables.')
	  }

	  // We have seen a lot of Markdown tables, that may have Liquid tags
	  // (like `{% ifversion ... %}`) within them lose the linebreak between
	  // the heading and the first row marker.
	  // For example:
	  //
	  //   | Sprache | Ökosystem | Manifestdatei | Unterstützter Abhängigkeitsbereich | |:---|:---:|:---:|:---|{% ifversion dep
	  //
	  // The equivalent English for that is:
	  //
	  //   | Language | Ecosystem | Manifest file | Dependency scope supported |
	  //   |:---|:---:|:---:|:---|
	  //   {%- ifversion dependency-graph-dart-support %}
	  //
	  // Let's inject these newline characters if found in the English content.
	  if (content.includes('| |:---|:') && englishContent.includes('|\n|:---|')) {
	    content = content.replaceAll('| |:---|:', '|\n|:---|:')
	  }
	  if (content.includes('|:---|{% ifversion') && englishContent.includes('|:---|\n{%- ifversion')) {
	    content = content.replaceAll('|:---|{% ifversion', '|:---|\n{%- ifversion')
	  }

	  if (context.dottedPath === 'reusables.copilot.differences-cfi-cfb-table') {
	    // As of Dec 2023, the French translation has a subtle typo.
	    // This string replace is highly specific and clearly only going to
	    // work if the exact French, incorrect, translation is still in use.
	    // Consider deleting these lines in mid-2024 because hopefully by then
	    // the translation mistake, which we've reported, will be corrected.
	    content = content.replace(
	      '{% data variables.copilot.cfi_price_per_month %} par utilisateur et par mois',
	      '{% data variables.copilot.cfb_price_per_month %} par utilisateur et par mois.',
	    )
	  }

	  // These are common mistakes made by translations that are specific.
	  // It's prevalent in all translations so that's why it's not per-language.
	  // It's important though that this happens after the other per-language
	  // specific fixes above. For example `{{% данных variables...`
	  content = content.replaceAll('{{% data variables.', '{% data variables.')
	  content = content.replaceAll('{%%data variables.', '{% data variables.')
	  content = content.replaceAll('{{% data reusables.', '{% data reusables.')
	  content = content.replaceAll('{%%data reusables.', '{% data reusables.')
	  content = content.replaceAll('{{% ifversion ', '{% ifversion ')

	  // A lot of Liquid tags lose their linebreak after the `}`
	  // result in formatting problems, especially around Markdown tables.
	  // This code here, compares each Liquid statement, in the translation,
	  // and tests if it appears like that but with a newline in the English.
	  // English example:
	  //
	  //   {%- ifversion ghes %}
	  //   | Thing | ✔️ |
	  //   {%- endif %}
	  //
	  // Translation example:
	  //
	  //   {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
	  //
	  // There exists the risk that different Liquid statements gets compared
	  // different Liquid statements in the English, but the risk is worth
	  // taking because even if this accidentally introduces a newline, it's
	  // unlikely to cause a problem. At worst that a sentence displays on its
	  // own paragraph.
	  content = content.replace(/\{%(.+?)%\} /g, (match) => {
	    if (match.lastIndexOf('{%') > 0) {
	      // For example:
	      //
	      //    `{% bla bla %}, and {% foo bar %} `
	      //
	      // Our regex is not greedy, but technically, if you look closely
	      // you'll see this is the first match that starts with `{%` and
	      // ends with `%} `. Let's skip these.
	      return match
	    }

	    // Same tag, but with the trailing space swapped for a newline.
	    const withLinebreak = `${match.slice(0, -1)}\n`
	    if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
	      return withLinebreak
	    }
	    return match
	  })
	  // The above corrections depend on looking for `{% foo %} ` and replacing
	  // it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
	  // content and `{% foo %} ` was not.
	  // However we see a lot of cases of this:
	  //
	  //    ... {% endif %} | First Column ...
	  //
	  // Which needs to become this:
	  //
	  //    ... {% endif %}
	  //    | First Column ...
	  //
	  // And since `{% endif %}` is such a common Liquid tag we can't rely
	  // on looking for it with `{% endif %}\n` in the English content.
	  //
	  // The pipe is escaped (`\|`) so it matches a literal `|` instead of
	  // acting as regex alternation.
	  content = content.replace(/\{% endif %\} \| /g, (match) => {
	    const potentiallyBetter = '{% endif %}\n| '
	    if (englishContent.includes(potentiallyBetter)) {
	      return potentiallyBetter
	    }
	    return match
	  })

	  // All too often we see translations that look like this:
	  //
	  //   | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
	  //
	  // Yes, that's one very long line. Notice how all the necessary linebreaks
	  // are suddenly gone.
	  content = content.replaceAll(' | | ', ' |\n| ')

	  // This is a bit of a hack, but it works.
	  // It looks for patterns like this:
	  //
	  //   Some words --------|-------|{
	  //
	  // And from that it tries to convert it to:
	  //
	  //   Some words
	  //   --------|-------|{
	  //
	  // But because it's quite a broad solution specifically around any
	  // Markdown table syntax, let's be extra careful and only apply it
	  // to the select few pages with known problems.
	  if (context.relativePath?.endsWith('scopes-for-oauth-apps.md')) {
	    if (context.code === 'pt') {
	      // As of Aug 2023, the Portuguese translation seems to have lost the
	      // `|` characters in their Markdown table syntax.
	      content = content.replace(/(\w)(\s-+\s-+\s){%/g, (whole, start, rest) => {
	        return `${start}\n${rest.replace(/\s/g, '|')}`
	      })
	    }
	    // Pipes are escaped so they match literally; only the first
	    // occurrence is rewritten (no `g` flag).
	    content = content.replace(/(\S\s*)(--+\|--+\|{)/, (whole, start, rest) => {
	      return `${start}\n${rest}`
	    })
	  }

	  // We used to mention this key within an English sentence. But that
	  // whole sentence is removed (from the English) and thus we need to remove
	  // same sentence from the translations as well.
	  // Ideally, the translators immediately notice the change but we can't
	  // guarantee that turnaround time. So we string replace it with an
	  // empty string.
	  // NOTE! By late 2024 all translations should have caught up with
	  // English translation (which removed the sentence). Then we can
	  // delete all of this code.
	  // See internal issue docs-content#13361
	  if (
	    context.relativePath ===
	    'authentication/managing-commit-signature-verification/about-commit-signature-verification.md'
	  ) {
	    const keyString = '5DE3 E050 9C47 EA3C F04A 42D3 4AEE 18F8 3AFD EB23'
	    const translatedSentences = [
	      // ru
	      `Полный отпечаток ключа\u00A0\u2014 \`${keyString}\`.`,
	      // ko
	      `키의 전체 지문은 \`${keyString}\`입니다.`,
	      // es
	      `La huella digital completa de la clave es \`${keyString}\`.`,
	      // zh
	      `密钥的完整指纹是 \`${keyString}\`。`,
	      // pt
	      `A impressão digital completa da chave é \`${keyString}\`.`,
	      // ja
	      `キーの完全な指紋は、\`${keyString}\` です。`,
	      // fr
	      `L\u2019empreinte digitale complète de la clé est \`${keyString}\`.`,
	      // de
	      `Der vollständige Fingerabdruck des Schlüssels ist \`${keyString}\`.`,
	    ]
	    // Only the first sentence found is removed, and only its first
	    // occurrence.
	    for (const translatedSentence of translatedSentences) {
	      if (content.includes(translatedSentence)) {
	        content = content.replace(translatedSentence, '')
	        break
	      }
	    }
	    if (content.includes(keyString)) {
	      // NOTE! These lines are for debugging and we can delete them once
	      // we're confident the keyString is no longer present in any
	      // translation.
	      // for (const line of content.split('\n')) {
	      //   if (line.includes(keyString)) {
	      //     console.log({ [context.code]: line })
	      //   }
	      // }
	      // throw new Error('Key string is still in there!')
	      content = content.replace(keyString, '[redacted in translation]')
	    }
	  }

	  // Diagnostic only: report any line that still contains `{{%` after all
	  // the corrections above. The content itself is not modified here.
	  if (content.includes('{{%')) {
	    const lines = content.split('\n')
	    for (let i = 0; i < lines.length; i++) {
	      const line = lines[i]
	      if (line.includes('{{%') && !line.includes('{{{% endraw')) {
	        console.log(context.code, 'context.relativePath', context.relativePath)
	        console.log(i, line)
	      }
	    }
	  }

	  return content
	}