File size: 2,267 Bytes
05a5750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os

# Absolute path of the documentation tree that purify_docs() walks for .md files.
docs_path = r'd:\aicoding\kaiyuan\v2\docs'

# Substring -> replacement table. process_file() applies each pair with
# str.replace, in the insertion order of this dict.
# NOTE(review): as this file currently reads, most keys are identical to their
# values (the flag emoji, dash, bullet, copyright sign and several language
# names), which makes those entries no-ops. Presumably the keys were meant to
# hold the corrupted (mojibake) sequences and were normalized when this script
# itself was re-saved -- confirm against the original revision.
REPLACEMENTS = {
    '🇺🇸': '🇺🇸',
    '🇭🇰': '🇭🇰',
    '🇪🇸': '🇪🇸',
    '🇫🇷': '🇫🇷',
    '🇩🇪': '🇩🇪',
    '🇯🇵': '🇯🇵',
    '🇰🇷': '🇰🇷',
    '🇸🇦': '🇸🇦',
    '🇵🇹': '🇵🇹',
    '—': '—',
    '•': '•',
    '©': '©',
    'Français': 'Français',
    'Español': 'Español',
    'Português': 'Português',
    '日本誠': '日本語',
    '한국얠': '한국어',
    '繠體中文': '繁體中文',
    'العربية': 'العربية',
    '\ufffd': '' # Remove replacement characters
}

def purify_docs():
    """Run process_file on every Markdown document of the project.

    Two passes are made, in this order:

    1. Every file whose name ends in ``.md`` anywhere beneath ``docs_path``.
    2. A fixed list of Markdown files in the project root, each processed
       only if it exists on disk.
    """
    # Pass 1: recursive sweep of the docs/ tree.
    for folder, _subdirs, names in os.walk(docs_path):
        for name in names:
            if not name.endswith('.md'):
                continue
            process_file(os.path.join(folder, name))

    # Pass 2: selected top-level documents; missing ones are skipped silently.
    root_path = r'd:\aicoding\kaiyuan\v2'
    for name in ('README.md', 'WHITEPAPER.md', 'RELEASE_V1.md', 'SECURITY.md'):
        candidate = os.path.join(root_path, name)
        if os.path.exists(candidate):
            process_file(candidate)

def process_file(filepath):
    """Rewrite one file in place with the REPLACEMENTS table applied.

    The file is read as raw bytes and decoded as UTF-8; if that fails it is
    decoded as Latin-1 instead, which accepts any byte sequence. Every
    key of REPLACEMENTS is then substituted by its value, and the result is
    written back to the same path as UTF-8.

    Args:
        filepath: Path of the file to read and overwrite.
    """
    print(f"Purifying: {filepath}")

    with open(filepath, 'rb') as src:
        raw = src.read()

    # Strict UTF-8 first; Latin-1 is the catch-all fallback.
    try:
        text = str(raw, 'utf-8')
    except UnicodeDecodeError:
        text = str(raw, 'latin-1')

    # Substitutions run sequentially, in the dict's insertion order.
    for broken, fixed in REPLACEMENTS.items():
        text = text.replace(broken, fixed)

    # newline='\n' disables newline translation on write, so line endings
    # present in the decoded text are emitted unchanged.
    with open(filepath, 'w', encoding='utf-8', newline='\n') as dst:
        dst.write(text)

# Script entry point: run the purification pass only when this file is
# executed directly (not when imported), then print a completion message.
if __name__ == "__main__":
    purify_docs()
    print("\nAll documentation files have been purified.")