import os
import re
# Path of the HTML file that audit_file() reads and then overwrites in place.
filepath = r'd:\aicoding\kaiyuan\v2\index.html'
# Literal substring replacements applied to the whole document by audit_file():
# every occurrence of a key is replaced by its value, in insertion order.
# NOTE(review): as the table appears here, many entries map a string to itself
# (a no-op) and some keys occur more than once (in a dict literal the last
# occurrence wins). Presumably the keys were meant to be the mis-decoded
# (mojibake) forms of the values -- verify against the raw bytes of this
# script before relying on any entry.
GLOBAL_REPLACEMENTS = {
# Flag emoji
'🇺🇸': '🇺🇸',
'ðŸ‡Â🇰': '🇭🇰',
'🇪🇸': '🇪🇸',
'🇫🇷': '🇫🇷',
'🇩🇪': '🇩🇪',
'🇯🇵': '🇯🇵',
'🇰🇷': '🇰🇷',
'🇸🇦': '🇸🇦',
'🇵🇹': '🇵🇹',
# Other emoji
'🌎': '🌍',
'📢': '📢',
'🌠': '🌍',
'📠': '📄',
# Variants that are all normalized to the small down-pointing triangle
'■': '▾',
'▾': '▾',
'â–?': '▾',
'â–': '▾',
# Language names
'Français': 'Français',
'Español': 'Español',
'Português': 'Português',
'日本誠': '日本語',
'ÕœêµÂ얠': '한국어',
'繠體ä¸Â文': '繁體中文',
'العربية': 'العربية',
# Punctuation and symbols
'©': '©',
'•': '•',
'—': '—',
# Second group of triangle / flag entries (several repeat keys listed above)
'▾': '▾',
'🇺🇸': '🇺🇸',
'ðŸ‡ðŸ‡°': '🇭🇰',
'🇪🇸': '🇪🇸',
'🇫🇷': '🇫🇷',
'🇩🇪': '🇩🇪',
'🇯🇵': '🇯🇵',
'🇰🇷': '🇰🇷',
'🇸🇦': '🇸🇦',
'🇵🇹': '🇵🇹'
}
def audit_file(path=None):
    """Repair known encoding artifacts in an HTML file and rewrite it in place.

    The file is read as raw bytes, decoded (UTF-8 first, Latin-1 as a
    fallback), run through a fixed series of literal and regex replacements,
    and written back to the same path encoded as UTF-8 with newline
    translation disabled.

    Args:
        path: File to clean. When omitted (``None``), the module-level
            ``filepath`` is used, which matches the original zero-argument
            behaviour.

    Returns:
        None. Progress and the div-count check are reported via ``print``.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    if path is None:
        path = filepath

    with open(path, 'rb') as f:
        data = f.read()

    # Try UTF-8 first. Latin-1 assigns a code point to every byte value, so
    # the fallback decode cannot itself raise.
    try:
        content = data.decode('utf-8')
    except UnicodeDecodeError:
        content = data.decode('latin-1')

    # 1. Global replacement of known artifacts.
    for old, new in GLOBAL_REPLACEMENTS.items():
        content = content.replace(old, new)

    # 2. Fix academic keywords and other script-level corruptions.
    content = content.replace('å¦你', '学位').replace('大å¦', '大学')
    content = content.replace('戠绩å ?', '成绩单').replace('诠书', '证书')
    content = content.replace('造堇', '造假')

    # 3. Structural health check.
    # Detect an existing UTF-8 charset declaration regardless of letter case
    # or quoting, so a file that already has <meta charset="utf-8"> does not
    # get a second tag. Insert after the first <head> only.
    has_utf8_meta = re.search(
        r'<meta\s+charset\s*=\s*["\']?utf-8', content, re.IGNORECASE
    )
    if not has_utf8_meta:
        content = content.replace(
            '<head>', '<head>\n <meta charset="UTF-8">', 1
        )

    # Repair closing tags whose leading "<" was turned into "?".
    content = re.sub(r'\?\s*/div>', '</div>', content)
    content = re.sub(r'\?\s*/button>', '</button>', content)

    # Rudimentary balance check: plain substring counts, not real parsing.
    open_divs = content.count('<div')
    close_divs = content.count('</div')
    print(f'Div count check: Open={open_divs}, Close={close_divs}')
    if open_divs > close_divs:
        # NOTE: this only reports the imbalance; no balancing is performed.
        print('WARNING: Unclosed divs detected. Attempting to balance...')

    # 4. Standardize all emojis.
    content = content.replace('🔍', '🔍').replace('🛡️', '🛡️').replace('🌐', '🌐')

    # Write back as UTF-8; newline='\n' disables newline translation on write.
    with open(path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(content)
    print('Deep Audit and Cleanup Complete.')
# Script entry point: run the cleanup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    audit_file()