// github-docs-arabic-enhanced / src /languages /lib /correct-translation-content.ts
// AbdulElahGwaith's picture
// Upload folder using huggingface_hub
// 88df9e4 verified
/**
* A lot of translations have minor corruptions that will lead to rendering
* failing (and having to rely on English fallback). Many of these are
* easy to manually correct for.
*
* This function is a temporary solution to correct for these corruptions.
* It looks for easy "low hanging fruit" that we can correct for.
*
*/
/**
 * Optional hints describing which translated document is being corrected.
 * Every field may be omitted; corrections that depend on a missing field
 * are simply not applied.
 */
interface CorrectionContext {
  /** Language code of the translation; compared against values like `ru`, `ja`, `zh`, `ko`, `es` and `pt`. */
  code?: string
  /** Relative path of the file being corrected, e.g. one ending in `scopes-for-oauth-apps.md`. */
  relativePath?: string
  /** Dotted identifier of the content, e.g. `reusables.copilot.differences-cfi-cfb-table`. */
  dottedPath?: string
  /** Any additional keys are accepted and ignored by the correction logic. */
  [key: string]: any
}
/**
 * Applies a series of known, "low hanging fruit" string corrections to a
 * translated piece of content so that it is less likely to fail rendering.
 *
 * The corrections are plain string/regex replacements. Some are applied to
 * every translation, some only when `context.code` matches a specific
 * language, and some only for specific `context.relativePath` or
 * `context.dottedPath` values. The order of the corrections matters: the
 * generic `{{% data ...` style fixes deliberately run after the
 * per-language fixes.
 *
 * As a side effect, any line that still contains `{{%` (and not
 * `{{{% endraw`) after all corrections is reported with `console.log`.
 *
 * @param content - The translated content to correct.
 * @param englishContent - The corresponding English content, used to decide
 *   whether a lost linebreak should be re-inserted.
 * @param context - Optional hints (language code, dotted path, relative
 *   path) that enable the more targeted corrections.
 * @returns The corrected content.
 */
export function correctTranslatedContentStrings(
  content: string,
  englishContent: string,
  context: CorrectionContext = {},
): string {
  // A lot of translations have corruptions around the AUTOTITLE links.
  // We've requested that these are corrected back but as a temporary
  // solution we'll manually recover now.
  // See internal issue #2762
  // In late 2023, search in the translations repos if these things are
  // still happening and if not, the following lines can be removed.
  content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
  content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
  content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
  content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')
  content = content.replaceAll('[ AUTOTITLE](', '[AUTOTITLE](')
  content = content.replaceAll('[ "AUTOTITLE](', '[AUTOTITLE](')
  content = content.replaceAll('[«AUTOTITLE»](', '[AUTOTITLE](')

  // We've seen a lot of these across different languages.
  content = content.replaceAll('{{% octicon', '{% octicon')
  content = content.replaceAll('{{%octicon', '{% octicon')
  content = content.replaceAll('{{% endif %}', '{% endif %}')
  content = content.replaceAll('{{%endif %}', '{% endif %}')

  // For a short while we injected `replacedomain` into code snippets
  // to activate the Domain Edit functionality. That was in `main` for a
  // while and was later removed in English. But during that window of
  // time, some translations picked it up. Let's remove it. For now.
  // The day we re-instate editable domain, delete these lines.
  if (content.includes('replacedomain')) {
    content = content.replaceAll('```text replacedomain copy', '```text copy')
    content = content.replaceAll('```shell replacedomain', '```shell')
  }

  if (context.code === 'ru') {
    // Low-hanging fruit for the data tag
    content = content.replaceAll('{% данных variables', '{% data variables')
    content = content.replaceAll('{% данными variables', '{% data variables')
    content = content.replaceAll('{% данных организации variables', '{% data variables')
    content = content.replaceAll('{% данным variables.', '{% data variables.')
    content = content.replaceAll('{% данные variables.', '{% data variables.')
    content = content.replaceAll('{% данных reusables', '{% data reusables')
    content = content.replaceAll('{% данными reusables', '{% data reusables')
    content = content.replaceAll('{% variables.', '{% data variables.')
    content = content.replaceAll('{% необработанного %}', '{% raw %}')
    content = content.replaceAll('{%- ifversion fpt или ghec %}', '{%- ifversion fpt or ghec %}')
    content = content.replaceAll('{% ifversion fpt или ghec %}', '{% ifversion fpt or ghec %}')
    content = content.replaceAll('{% endif _%}', '{% endif %}')
    content = content.replaceAll('{% конечным %}', '{% endif %}')
    content = content.replaceAll('{% переменных данных.', '{% data variables.')
    content = content.replaceAll('{% повторно используемых данных.', '{% data reusables.')
    content = content.replaceAll('{% примечание %}', '{% note %}')
    content = content.replaceAll('{% конечных головщиков %}', '{% endrowheaders %}')
    content = content.replaceAll('{% данных для повторного использования.', '{% data reusables.')
    content = content.replaceAll('{% еще %}', '{% else %}')
    content = content.replaceAll('{% необработанные %}', '{% raw %}')
    content = content.replaceAll('{% подсказки %}', '{% tip %}')

    // Fix YAML quote issues in UI files. Specifically the disclaimer href attribute
    // href="...}> -> href="...">
    content = content.replace(/href="([^"]*)}>/g, 'href="$1">')

    // Fix double quotes in Russian YAML files that cause parsing errors
    // ""https:// -> "https://
    content = content.replace(/href=""https:\/\//g, 'href="https://')

    // Fix empty HTML tags that cause YAML parsing issues
    content = content.replaceAll('<b></b>', '')
    content = content.replaceAll('<u></u>', '')

    // Fix specific Russian UI YAML issues causing 502 errors
    // Remove empty bold tags from early_access notice.
    // NOTE: the blanket removals just above normally leave nothing for the
    // next two replacements to match; they are kept as a safety net.
    content = content.replace(/early_access:\s*"([^"]*)<b><\/b>([^"]*)"/, 'early_access: "$1$2"')
    // Remove empty underline tags from privacy disclaimer
    content = content.replace(/(privacy_disclaimer:[^<]*)<u><\/u>/g, '$1')

    // For the rather custom Russian translation of
    // the content/get-started/learning-about-github/github-glossary.md page
    // These string replacements speak for themselves.
    content = content.replaceAll(
      '{% для глоссария в глоссариях %}',
      '{% for glossary in glossaries %}',
    )
    content = content.replaceAll('{{ глоссарий.term }}', '{{ glossary.term }}')
    content = content.replaceAll('{{ глоссарий.description }}', '{{ glossary.description }}')
  }

  if (context.code === 'ja') {
    // Low-hanging fruit for the data tag
    content = content.replaceAll('{% データ variables', '{% data variables')
    content = content.replaceAll('{% データvariables', '{% data variables')

    // Fix specific issue likely causing 502 errors
    // Remove trailing quote from the problematic translation
    content = content.replace(
      /asked_too_many_times:\s*申し訳ありません。短い時間に質問が多すぎます。\s*しばらく待ってからもう一度やり直してください。"\s*$/gm,
      'asked_too_many_times: 申し訳ありません。短い時間に質問が多すぎます。 しばらく待ってからもう一度やり直してください。',
    )

    // Internal issue #4160
    content = content.replaceAll(
      '- % data variables.product.prodname_copilot_enterprise %}',
      '- {% data variables.copilot.copilot_enterprise %}',
    )

    // This might not be exclusive to Japanese but put here because, at
    // the time of writing, it only happens on the Japanese translations.
    // According to the Microsoft translation guidelines, they're not
    // supposed to translate words that will be seen in the UI, but
    // instead mention them like this:
    //
    //   [Save changes](THE TRANSLATION OF "Save changes" IN JAPANESE)
    //
    // The problem is when these are wrapped in a deliberate Markdown link.
    // For example:
    //
    //   [[Save changes](THE TRANSLATION OF "Save changes" IN JAPANESE)](#some-section)
    //
    // A real observed example is:
    //
    //   [[Allow deletions](削除を許可)](#allow-deletions)
    //
    // Here, because "削除を許可" contains no spaces, the Markdown parser
    // thinks "削除を許可" is the URL! But in actuality,
    // `[Allow deletions](削除を許可)` is the text and `#allow-deletions`
    // is the URL.
    // This problem does not exhibit if the text "削除を許可" were to contain
    // a space character. But we can't assume that we can just add a space.
    // For example "削除 を許可" would be incorrect. And where do you put the
    // space? Between which characters.
    // Instead, we can inject a "hair space" whitespace character between
    // the `]` and the `(`. Then, the Markdown processor does not get confused
    // and the link is rendered correctly.
    // The `\u200A` is the "hair space" character. Technically whitespace
    // but not wide enough to visually appear as a space.
    content = content.replace(/\[(\[.*?\])(\(\S+\)\]\()/g, '[$1\u200A$2')
  }

  if (context.code === 'zh') {
    // Low-hanging fruit for the data tag
    content = content.replaceAll('{% 数据variables', '{% data variables')
  }

  if (context.code === 'ko') {
    // Low-hanging fruit for the data tag
    content = content.replaceAll('{% 데이터 variables', '{% data variables')
    content = content.replaceAll('{% 데이터 reusables.', '{% data reusables.')

    // For the rather custom Korean translation of github-glossary.md
    // Let's try to salvage based on what's in
    // docs-internal.ko-kr/content/get-started/learning-about-github/github-glossary.md
    // as of September 2023.
    content = content.replaceAll('용어집 %}의 용어집에 대한 {%', '{% for glossary in glossaries %}')
    content = content.replaceAll('{{ 용어집.term }}', '{{ glossary.term }}')
    content = content.replaceAll('{{ 용어집.description }}', '{{ glossary.description }}')
  }

  if (context.code === 'es') {
    // Seen these a few times in the Spanish translations.
    content = content.replaceAll('{% vulnerables variables.', '{% data variables.')
  }

  // We have seen a lot of Markdown tables, that may have Liquid tags
  // (like `{% ifversion ... %}`) within them lose the linebreak between
  // the heading and the first row marker.
  // For example:
  //
  //   | **Sprache** | **Ökosystem** | **Manifestdatei** | **Unterstützter Abhängigkeitsbereich** | |:---|:---:|:---:|:---|{% ifversion dep
  //
  // The equivalent English for that is:
  //
  //   | **Language** | **Ecosystem** | **Manifest file** | **Dependency scope supported** |
  //   |:---|:---:|:---:|:---|
  //   {%- ifversion dependency-graph-dart-support %}
  //
  // Let's inject these newline characters if found in the English content.
  if (content.includes('| |:---|:') && englishContent.includes('|\n|:---|')) {
    content = content.replaceAll('| |:---|:', '|\n|:---|:')
  }
  if (content.includes('|:---|{% ifversion') && englishContent.includes('|:---|\n{%- ifversion')) {
    content = content.replaceAll('|:---|{% ifversion', '|:---|\n{%- ifversion')
  }

  if (context.dottedPath === 'reusables.copilot.differences-cfi-cfb-table') {
    // As of Dec 2023, the French translation has a subtle typo.
    // This string replace is highly specific and clearly only going to
    // work if the exact French, incorrect, translation is still in use.
    // Consider deleting these lines in mid-2024 because hopefully by then
    // the translation mistake, which we've reported, will be corrected.
    content = content.replace(
      '{% data variables.copilot.cfi_price_per_month %} par utilisateur et par mois',
      '{% data variables.copilot.cfb_price_per_month %} par utilisateur et par mois.',
    )
  }

  // These are common mistakes made by translations that are specific.
  // It's prevalent in all translations so that's why it's not per-language.
  // It's important though that this happens after the other per-language
  // specific fixes above. For example `{{% данных variables...`
  content = content.replaceAll('{{% data variables.', '{% data variables.')
  content = content.replaceAll('{%%data variables.', '{% data variables.')
  content = content.replaceAll('{{% data reusables.', '{% data reusables.')
  content = content.replaceAll('{%%data reusables.', '{% data reusables.')
  content = content.replaceAll('{{% ifversion ', '{% ifversion ')

  // A lot of Liquid tags lose their linebreak after the `}`
  // result in formatting problems, especially around Markdown tables.
  // This code here, compares each Liquid statement, in the translation,
  // and tests if it appears like that but with a newline in the English.
  // English example:
  //
  //    {%- ifversion ghes %}
  //    | Thing | ✔️ |
  //    {%- endif %}
  //
  // Translation example:
  //
  //    {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
  //
  // There exists the risk that different Liquid statements get compared
  // to different Liquid statements in the English, but the risk is worth
  // taking because even if this accidentally introduces a newline, it's
  // unlikely to cause a problem. At worst that a sentence displays on its
  // own paragraph.
  content = content.replace(/\{%(.+?)%\} /g, (match) => {
    if (match.lastIndexOf('{%') > 0) {
      // For example:
      //
      //    `{% bla bla %}, and {% foo bar %} `
      //
      // Our regex is not greedy, but technically, if you look closely
      // you'll see this is the first match that starts with `{%` and
      // ends with `%} `. Let's skip these.
      return match
    }

    // Swap the trailing space for a newline and only use that variant when
    // the English has it with the newline and never with the space.
    const withLinebreak = `${match.slice(0, -1)}\n`
    if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
      return withLinebreak
    }
    return match
  })

  // The above corrections depend on looking for `{% foo %} ` and replacing
  // it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
  // content and `{% foo %} ` was *not*.
  // However we see a lot of cases of this:
  //
  //    ... {% endif %} | First Column ...
  //
  // Which needs to become this:
  //
  //    ... {% endif %}
  //    | First Column ...
  //
  // And since `{% endif %}` is such a common Liquid tag we can't rely
  // on looking for it with `{% endif %}\n` in the English content.
  content = content.replace(/\{% endif %\} \| /g, (match) => {
    const potentiallyBetter = '{% endif %}\n| '
    if (englishContent.includes(potentiallyBetter)) {
      return potentiallyBetter
    }
    return match
  })

  // All too often we see translations that look like this:
  //
  //   | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
  //
  // Yes, that's one very long line. Notice how all the necessary linebreaks
  // are suddenly gone.
  content = content.replaceAll(' | | ', ' |\n| ')

  // This is a bit of a hack, but it works.
  // It looks for patterns like this:
  //
  //   Some words --------|-------|{
  //
  // And from that it tries to convert it to:
  //
  //   Some words
  //   --------|-------|{
  //
  // But because it's quite a broad solution specifically around any
  // Markdown table syntax, let's be extra careful and only apply it
  // to the select few pages with known problems.
  if (context.relativePath?.endsWith('scopes-for-oauth-apps.md')) {
    if (context.code === 'pt') {
      // As of Aug 2023, the Portuguese translation seems to have lost the
      // `|` characters in their Markdown table syntax.
      content = content.replace(/(\w)(\s-+\s-+\s){%/g, (whole, start, rest) => {
        return `${start}\n${rest.replace(/\s/g, '|')}`
      })
    }

    content = content.replace(/(\S\s*)(--+\|--+\|{)/, (whole, start, rest) => {
      return `${start}\n${rest}`
    })
  }

  // We *used* to mention this key within an English sentence. But that
  // whole sentence is removed (from the English) and thus we need to remove
  // same sentence from the translations as well.
  // Ideally, the translators immediately notice the change but we can't
  // guarantee that turnaround time. So we string replace it with an
  // empty string.
  // NOTE! By late 2024 all translations *should* have caught up with
  // English translation (which removed the sentence). Then we can
  // delete all of this code.
  // See internal issue docs-content#13361
  if (
    context.relativePath ===
    'authentication/managing-commit-signature-verification/about-commit-signature-verification.md'
  ) {
    const keyString = '5DE3 E050 9C47 EA3C F04A 42D3 4AEE 18F8 3AFD EB23'
    const translatedSentences = [
      // ru
      `Полный отпечаток ключа\u00A0\u2014 \`${keyString}\`.`,
      // ko
      `키의 전체 지문은 \`${keyString}\`입니다.`,
      // es
      `La huella digital completa de la clave es \`${keyString}\`.`,
      // zh
      `密钥的完整指纹是 \`${keyString}\`。`,
      // pt
      `A impressão digital completa da chave é \`${keyString}\`.`,
      // ja
      `キーの完全な指紋は、\`${keyString}\` です。`,
      // fr
      `L\u2019empreinte digitale complète de la clé est \`${keyString}\`.`,
      // de
      `Der vollständige Fingerabdruck des Schlüssels ist \`${keyString}\`.`,
    ]
    for (const translatedSentence of translatedSentences) {
      if (content.includes(translatedSentence)) {
        // Remove every copy of the sentence, not just the first one.
        content = content.replaceAll(translatedSentence, '')
        break
      }
    }
    if (content.includes(keyString)) {
      // NOTE! These lines are for debugging and we can delete them once
      // we're confident the keyString is no longer present in any
      // translation.
      // for (const line of content.split('\n')) {
      //   if (line.includes(keyString)) {
      //     console.log({ [context.code]: line })
      //   }
      // }
      // throw new Error('Key string is still in there!')

      // Redact every remaining occurrence. A plain string `replace` would
      // only touch the first one and leave later copies of the key behind.
      content = content.replaceAll(keyString, '[redacted in translation]')
    }
  }

  // Report (but do not alter) any line that still has a `{{%` corruption
  // after all of the corrections above.
  if (content.includes('{{%')) {
    const lines = content.split('\n')
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i]
      if (line.includes('{{%') && !line.includes('{{{% endraw')) {
        console.log(context.code, 'context.relativePath', context.relativePath)
        console.log(i, line)
      }
    }
  }

  return content
}