"""Repair known encoding corruption (mojibake) in a single HTML file, in place."""

import os
import re

filepath = r'd:\aicoding\kaiyuan\v2\index.html'

# Common corruptions to fix globally.
# NOTE(review): several pairs below look identical (key == value) in this copy
# of the script; presumably the keys were originally the mojibake forms and
# were normalised somewhere along the way -- verify against the real file.
GLOBAL_REPLACEMENTS = {
    '🇺🇸': '🇺🇸',
    '🇭🇰': '🇭🇰',
    '🇪🇸': '🇪🇸',
    '🇫🇷': '🇫🇷',
    '🇩🇪': '🇩🇪',
    '🇯🇵': '🇯🇵',
    '🇰🇷': '🇰🇷',
    '🇸🇦': '🇸🇦',
    '🇵🇹': '🇵🇹',
    '🌎': '🌍',
    '📢': '📢',
    '🌠': '🌍',
    '📠': '📄',
    '■': '▾',
    '▾': '▾',
    # The longer artifact must stay ahead of its prefix so it is matched first.
    'â–?': '▾',
    'â–': '▾',
    'Français': 'Français',
    'Español': 'Español',
    'Português': 'Português',
    '日本誠': '日本語',
    '한국얠': '한국어',
    '繠體中文': '繁體中文',
    'العربية': 'العربية',
    '©': '©',
    '•': '•',
    '—': '—',
}

# Script-level corruptions (academic keywords), applied after the global table.
_KEYWORD_REPLACEMENTS = (
    ('学你', '学位'),
    ('大学', '大学'),
    ('戠绩å ?', '成绩单'),
    ('诠书', '证书'),
    ('造堇', '造假'),
)

# Emoji normalisation, applied last.
_EMOJI_REPLACEMENTS = (
    ('🔍', '🔍'),
    ('🛡️', '🛡️'),
    ('🌐', '🌐'),
)

# A closing tag whose leading '<' was lost and shows up as '?', e.g. "?/div>".
_BROKEN_CLOSING_TAG = re.compile(r'\?\s*/(div|button)>')


def _read_text(path):
    """Return the file's text, decoded as UTF-8 with a Latin-1 fallback."""
    with open(path, 'rb') as f:
        data = f.read()
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a code point, so this cannot fail.
        return data.decode('latin-1')


def _replace_all(content, pairs):
    """Apply each (old, new) substring replacement to content, in order."""
    for old, new in pairs:
        content = content.replace(old, new)
    return content


def _warn_on_unclosed_divs(content):
    """Print a warning when there are more opening than closing div tags."""
    # NOTE(review): the two count() arguments were lost from the original
    # script; '<div' and '</div>' are reconstructed from the variable names
    # and the surrounding comment -- confirm.
    open_divs = content.count('<div')
    close_divs = content.count('</div>')
    if open_divs > close_divs:
        # Rudimentary check: the problem is only reported, nothing is
        # rebalanced automatically despite the wording of the message.
        print('WARNING: Unclosed divs detected. Attempting to balance...')


def audit_file(path=filepath):
    """Clean known encoding artifacts out of an HTML file and rewrite it.

    The file is read as bytes and decoded (UTF-8, falling back to Latin-1),
    the replacement tables are applied, broken ``?/div>`` / ``?/button>``
    closing tags are repaired, a div-balance warning may be printed, and the
    result is written back as UTF-8 without newline translation.

    Args:
        path: File to clean in place. Defaults to the module-level
            ``filepath``.
    """
    content = _read_text(path)

    # 1. Global replacement of known artifacts.
    content = _replace_all(content, GLOBAL_REPLACEMENTS.items())

    # 2. Fix academic keywords and other script-level corruptions.
    content = _replace_all(content, _KEYWORD_REPLACEMENTS)

    # 3. Structural health check.
    # NOTE(review): the original had a "main sections present" step here whose
    # marker strings were empty (`'' not in content`), so it could never run.
    # The intended markers are unknown; restore the step once they are.

    # Repair closing tags whose '<' was swallowed, leaving "?/div>" behind.
    # Presumably the original replacement strings were the real closing tags;
    # replacing with nothing would delete the tag and unbalance the markup.
    content = _BROKEN_CLOSING_TAG.sub(r'</\1>', content)

    _warn_on_unclosed_divs(content)

    # 4. Standardize all emojis.
    content = _replace_all(content, _EMOJI_REPLACEMENTS)

    # Write back clean; newline='\n' disables newline translation on write.
    with open(path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(content)
    print('Deep Audit and Cleanup Complete.')


if __name__ == '__main__':
    audit_file()