File size: 2,267 Bytes
05a5750 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | import os
# Directory that purify_docs walks recursively looking for '.md' files.
# NOTE(review): hard-coded absolute Windows path — the script only works on a
# machine with this exact layout.
docs_path = r'd:\aicoding\kaiyuan\v2\docs'
# Common corruptions to fix in documentation
# Each key is a text fragment to search for; its value is the text that
# replaces it. process_file applies the pairs one after another, in the
# order they are listed here.
# NOTE(review): as this file currently reads, many keys are identical to
# their values (most flag entries, the dash/bullet/copyright entries and
# several language names), which makes those replacements no-ops.
# Presumably the keys were meant to be the mis-decoded (mojibake) forms —
# confirm against the encoding this file is actually stored in.
REPLACEMENTS = {
# Country flag emoji
'🇺🇸': '🇺🇸',
'ðŸ‡Â🇰': '🇭🇰',
'🇪🇸': '🇪🇸',
'🇫🇷': '🇫🇷',
'🇩🇪': '🇩🇪',
'🇯🇵': '🇯🇵',
'🇰🇷': '🇰🇷',
'🇸🇦': '🇸🇦',
'🇵🇹': '🇵🇹',
# Punctuation and symbols
'—': '—',
'•': '•',
'©': '©',
# Language names
'Français': 'Français',
'Español': 'Español',
'Português': 'Português',
'日本誠': '日本語',
'ÕœêµÂ얠': '한국어',
'繠體ä¸Â文': '繁體中文',
'العربية': 'العربية',
'\ufffd': '' # Remove replacement characters
}
def purify_docs(docs_dir=None, root_dir=None):
    """Clean every Markdown file under the docs tree plus selected root files.

    Each matching file is handed to ``process_file``, which rewrites it in
    place.

    Args:
        docs_dir: Directory walked recursively for files ending in ``.md``.
            Defaults to the module-level ``docs_path``.
        root_dir: Directory that holds the individually named top-level
            Markdown files. Defaults to the original hard-coded project root.
    """
    if docs_dir is None:
        docs_dir = docs_path
    if root_dir is None:
        root_dir = r'd:\aicoding\kaiyuan\v2'

    # Scan docs/ directory
    for root, _dirs, files in os.walk(docs_dir):
        for file in files:
            if file.endswith('.md'):
                process_file(os.path.join(root, file))

    # Scan root directory for specific markdown files
    root_md_files = ['README.md', 'WHITEPAPER.md', 'RELEASE_V1.md', 'SECURITY.md']
    for file in root_md_files:
        filepath = os.path.join(root_dir, file)
        # isfile rather than exists: a directory that happens to carry one of
        # these names must be skipped, not opened as a file.
        if os.path.isfile(filepath):
            process_file(filepath)
def process_file(filepath, replacements=None):
    """Apply the corruption replacements to one file and store it as UTF-8.

    The file is read as raw bytes and decoded as UTF-8; if that fails, the
    bytes are decoded as Latin-1 instead. Every ``old -> new`` pair is then
    applied with ``str.replace`` in mapping order, and the result is written
    back UTF-8 encoded with no newline translation.

    Args:
        filepath: Path of the file to clean in place.
        replacements: Optional mapping of text to search for to its
            replacement. Defaults to the module-level ``REPLACEMENTS`` table.

    Raises:
        OSError: If the file cannot be read or written.
    """
    if replacements is None:
        replacements = REPLACEMENTS

    print(f"Purifying: {filepath}")

    with open(filepath, 'rb') as f:
        data = f.read()

    try:
        content = data.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 assigns a code point to every byte, so this cannot fail.
        content = data.decode('latin-1')

    # Apply replacements sequentially; the output of one pair is the input of
    # the next, so the order of the mapping matters.
    for old, new in replacements.items():
        content = content.replace(old, new)

    cleaned = content.encode('utf-8')
    if cleaned == data:
        # Nothing would change on disk: skip the write so the file is not
        # truncated and rewritten for no reason.
        return

    # Binary write of the encoded text produces the same bytes as a text-mode
    # write with encoding='utf-8' and newline='\n' (no newline translation).
    with open(filepath, 'wb') as f:
        f.write(cleaned)
# Script entry point: the clean-up runs only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    purify_docs()
    # The leading newline sets the summary apart from the per-file log lines.
    print("\nAll documentation files have been purified.")
|