"""Generate the list of ambiguity-carrying tokens from a BPE vocabulary.

Reads a vocabulary JSON file containing a ``token_to_id`` mapping, collects
every non-special token that contains an uncertainty marker, and writes the
result (tokens, ids, and the source vocabulary path) to a JSON file.
"""
import json
import re

# Old ambiguity markers, kept for reference only -- the scan below does NOT use them.
# Note: Some of these might be valid standard chars in WURCS (like 'u' unknown anomer),
# but we want to know ANY token that contains uncertainty.
ambiguous_chars = {'?', 'x', 'X', 'u', 'U', 'd', 'D', 'o', 'O'}

VOCAB_PATH = 'data/bpe_vocabulary_clean.json'
OUTPUT_PATH = 'data/bpe_ambiguity_tokens.json'


def find_ambiguous_tokens(token_to_id):
    """Select the tokens of a vocabulary that carry an ambiguity marker.

    A token is selected when it does not start with ``'['`` (tokens starting
    with ``'['`` are treated as special tokens and skipped) and it contains
    ``'?'`` or the case-insensitive substring ``'unknown'``.

    Args:
        token_to_id: Mapping of token string to token id.

    Returns:
        A ``(ambiguous_tokens, ambiguous_ids)`` tuple: the selected
        token -> id mapping, and the list of selected ids in the same
        (vocabulary iteration) order.
    """
    ambiguous_tokens = {}
    for token, token_id in token_to_id.items():
        if token.startswith('['):
            continue  # Skip special tokens

        # '?' is the main marker: WURCS uses '?' for unknown linkage/anomer.
        # 'X' / 'x' are deliberately not matched here; whether they denote
        # unknowns in this dataset is still an open question.
        if '?' in token or 'unknown' in token.lower():
            ambiguous_tokens[token] = token_id

    # Dicts preserve insertion order and each token is visited once, so the
    # values are exactly the ids in the order the tokens were selected.
    return ambiguous_tokens, list(ambiguous_tokens.values())


def main(vocab_path=VOCAB_PATH, output_path=OUTPUT_PATH):
    """Load the vocabulary, find ambiguous tokens, and save them as JSON.

    Args:
        vocab_path: Path of the vocabulary JSON file; it must contain a
            ``token_to_id`` object.
        output_path: Path of the JSON file to write.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If the vocabulary has no ``token_to_id`` entry.
    """
    print("Generating BPE ambiguity tokens...")

    # JSON is UTF-8 by specification; do not depend on the platform locale,
    # which would mis-decode non-ASCII tokens on some systems.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)

    ambiguous_tokens, ambiguous_ids = find_ambiguous_tokens(vocab['token_to_id'])

    print(f"Found {len(ambiguous_tokens)} ambiguous tokens in BPE vocab.")

    output_data = {
        "ambiguous_tokens": ambiguous_tokens,
        "ambiguous_ids": ambiguous_ids,
        "source_vocab": vocab_path,
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    print(f"Saved to {output_path}")


if __name__ == "__main__":
    main()