bertose-affinose-training-code / code /tokenizer /generate_bpe_ambiguity.py
supanthadey1's picture
Add BERTose and AFFINose training code release
1d6f391 verified
Raw
History Blame Contribute Delete
1.39 kB
import json
import re
# Default input/output locations, relative to the working directory.
VOCAB_PATH = 'data/bpe_vocabulary_clean.json'
OUTPUT_PATH = 'data/bpe_ambiguity_tokens.json'

# Old ambiguity tokens (for reference only -- NOT consulted by the detection
# below, which keys solely on '?' and the substring 'unknown').
# Note: Some of these might be valid standard chars in WURCS (like 'u' unknown
# anomer), which is why they are not used as blanket ambiguity markers here.
ambiguous_chars = {'?', 'x', 'X', 'u', 'U', 'd', 'D', 'o', 'O'}


def find_ambiguous_tokens(token_to_id):
    """Select the vocabulary tokens that carry an ambiguity marker.

    A token is considered ambiguous when it contains '?' or, compared
    case-insensitively, the substring 'unknown'. Tokens starting with '['
    are treated as special tokens and are always skipped.

    Args:
        token_to_id: Mapping of token string to token id.

    Returns:
        A ``(ambiguous_tokens, ambiguous_ids)`` tuple: the token -> id mapping
        of the flagged tokens, and the list of their ids in the same
        (vocabulary iteration) order.
    """
    ambiguous_tokens = {
        token: token_id
        for token, token_id in token_to_id.items()
        # Skip special tokens, then look for the ambiguity markers.
        # '?' is the main one. WURCS uses '?' for unknown linkage/anomer.
        if not token.startswith('[')
        and ('?' in token or 'unknown' in token.lower())
    }
    # NOTE(review): 'X' / 'x' may also denote unknowns in this dataset
    # (unknown anomer / unknown residue?) -- they are deliberately not
    # checked here; confirm before extending the rule.

    # Keys are unique, so the id list is simply the values in insertion order.
    return ambiguous_tokens, list(ambiguous_tokens.values())


def main(vocab_path=VOCAB_PATH, output_path=OUTPUT_PATH):
    """Load the BPE vocabulary, collect ambiguous tokens and save them as JSON.

    Args:
        vocab_path: JSON file holding a top-level 'token_to_id' mapping.
        output_path: Destination JSON file for the ambiguity report.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If the vocabulary has no 'token_to_id' entry.
    """
    print("Generating BPE ambiguity tokens...")

    # Load new BPE vocabulary. The encoding is pinned to UTF-8 so that
    # decoding does not depend on the platform's locale default.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)

    ambiguous_tokens, ambiguous_ids = find_ambiguous_tokens(vocab['token_to_id'])
    print(f"Found {len(ambiguous_tokens)} ambiguous tokens in BPE vocab.")

    # Save
    output_data = {
        "ambiguous_tokens": ambiguous_tokens,
        "ambiguous_ids": ambiguous_ids,
        "source_vocab": vocab_path,
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)
    print(f"Saved to {output_path}")


if __name__ == "__main__":
    main()