# File size: 3,583 Bytes
# 05a5750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import re

# Absolute Windows path of the HTML file that audit_file() reads and then
# overwrites in place.
filepath = r'd:\aicoding\kaiyuan\v2\index.html'

# Common corruptions to fix globally.
#
# Maps a corrupted substring (key) to the text that should replace it
# (value). audit_file() walks this dict in insertion order and applies each
# pair with str.replace, so every occurrence in the file is rewritten.
#
# Ordering matters for overlapping keys: 'â–?' is listed before its prefix
# 'â–' so the longer artifact is consumed first.
#
# NOTE(review): as rendered here, many pairs show a key identical to its
# value (which would make the replacement a no-op), and the flag entries plus
# '▾' are listed more than once. Repeated keys in a dict literal are legal;
# the last value given for a key is the one kept. The original mojibake keys
# may have been lost when this script itself was re-encoded -- confirm the
# literals at the byte level before pruning anything.
GLOBAL_REPLACEMENTS = {
    '🇺🇸': '🇺🇸',
    '🇭🇰': '🇭🇰',
    '🇪🇸': '🇪🇸',
    '🇫🇷': '🇫🇷',
    '🇩🇪': '🇩🇪',
    '🇯🇵': '🇯🇵',
    '🇰🇷': '🇰🇷',
    '🇸🇦': '🇸🇦',
    '🇵🇹': '🇵🇹',
    '🌎': '🌍',
    '📢': '📢',
    '🌠': '🌍',
    '📠': '📄',
    '■': '▾',
    '▾': '▾',
    'â–?': '▾',
    'â–': '▾',
    'Français': 'Français',
    'Español': 'Español',
    'Português': 'Português',
    '日本誠': '日本語',
    '한국얠': '한국어',
    '繠體中文': '繁體中文',
    'العربية': 'العربية',
    '©': '©',
    '•': '•',
    '—': '—',
    '▾': '▾',
    '🇺🇸': '🇺🇸',
    '🇭🇰': '🇭🇰',
    '🇪🇸': '🇪🇸',
    '🇫🇷': '🇫🇷',
    '🇩🇪': '🇩🇪',
    '🇯🇵': '🇯🇵',
    '🇰🇷': '🇰🇷',
    '🇸🇦': '🇸🇦',
    '🇵🇹': '🇵🇹'
}

def audit_file(path=None):
    """Repair known encoding artifacts in an HTML file and rewrite it in place.

    The file is read as raw bytes, decoded (UTF-8 first, Latin-1 as a
    fallback), passed through ``GLOBAL_REPLACEMENTS`` and a handful of
    targeted fixes, and finally written back encoded as UTF-8.

    Args:
        path: File to clean. When omitted, the module-level ``filepath`` is
            used, so existing ``audit_file()`` calls behave as before.

    Returns:
        None. Progress and the div-balance report are emitted with ``print``.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    if path is None:
        path = filepath

    with open(path, 'rb') as f:
        data = f.read()

    # Try decoding to find the current mess. Latin-1 assigns a code point to
    # every possible byte, so the fallback itself can never raise; it keeps
    # the script running on files that are not valid UTF-8.
    try:
        content = data.decode('utf-8')
    except UnicodeDecodeError:
        content = data.decode('latin-1')

    # 1. Global replacement of known artifacts, in table order.
    for old, new in GLOBAL_REPLACEMENTS.items():
        content = content.replace(old, new)

    # 2. Fix academic keywords and other script-level corruptions.
    # NOTE(review): some pairs below render with identical old/new text;
    # verify the literals at the byte level before removing any of them.
    content = content.replace('学你', '学位').replace('大学', '大学')
    content = content.replace('戠绩å ?', '成绩单').replace('诠书', '证书')
    content = content.replace('造堇', '造假')

    # 3. Structural health check.
    # Declare the charset right after <head> when the exact tag is missing.
    if '<meta charset="UTF-8">' not in content:
        content = content.replace('<head>', '<head>\n    <meta charset="UTF-8">')

    # Restore closing tags whose leading '<' was turned into '?'
    # (e.g. "?/div>" or "? /button>").
    content = re.sub(r'\?\s*/div>', '</div>', content)
    content = re.sub(r'\?\s*/button>', '</button>', content)

    # Rudimentary balance check: plain substring counts, so any tag whose
    # name merely starts with "div" is counted as well.
    open_divs = content.count('<div')
    close_divs = content.count('</div')
    print(f'Div count check: Open={open_divs}, Close={close_divs}')

    if open_divs > close_divs:
        # Report only: no repair is attempted here, so the message must not
        # claim otherwise.
        print('WARNING: Unclosed divs detected. '
              'No automatic balancing is performed; review the markup manually.')

    # 4. Standardize all emojis.
    content = content.replace('🔍', '🔍').replace('🛡️', '🛡️').replace('🌐', '🌐')

    # Write back clean. newline='\n' disables newline translation, so the
    # line endings already present in ``content`` are written unchanged.
    with open(path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(content)

    print('Deep Audit and Cleanup Complete.')

# Run the cleanup only when this file is executed as a script; importing the
# module leaves the target file untouched.
if __name__ == '__main__':
    audit_file()