import json
import re

# Input BPE vocabulary and output location for the ambiguity-token table.
VOCAB_PATH = 'data/bpe_vocabulary_clean.json'
OUTPUT_PATH = 'data/bpe_ambiguity_tokens.json'

# NOTE: An earlier version of this script listed the single characters
# ? x X u U d D o O as candidate ambiguity markers. Only '?' and the
# substring "unknown" are actually checked, because the letters can also be
# ordinary characters inside a token. Whether 'x'/'X' act as unknown markers
# in this dataset is still an open question (see the original author's notes).


def _is_ambiguous(token):
    """Return True if *token* contains an uncertainty marker.

    A token counts as ambiguous when it contains a literal '?' (per the
    original author's note, WURCS uses '?' for unknown linkage/anomer) or
    the case-insensitive substring "unknown".
    """
    return '?' in token or 'unknown' in token.lower()


def find_ambiguous_tokens(token_to_id):
    """Select the ambiguous tokens from a token -> id mapping.

    Tokens starting with '[' are treated as special tokens and skipped,
    even if they contain an uncertainty marker.

    Args:
        token_to_id: Mapping of token string to token id.

    Returns:
        A ``(ambiguous_tokens, ambiguous_ids)`` tuple: a dict of the selected
        token -> id pairs and a list of their ids, both in the iteration
        order of *token_to_id*.
    """
    ambiguous_tokens = {
        token: token_id
        for token, token_id in token_to_id.items()
        if not token.startswith('[') and _is_ambiguous(token)
    }
    return ambiguous_tokens, list(ambiguous_tokens.values())


def main():
    """Read the BPE vocabulary, collect ambiguous tokens, and save them as JSON.

    Raises:
        OSError: If the vocabulary cannot be read or the output cannot be
            written.
        KeyError: If the vocabulary JSON has no ``token_to_id`` entry.
    """
    print("Generating BPE ambiguity tokens...")

    # JSON is UTF-8 by specification; be explicit so decoding of non-ASCII
    # tokens does not depend on the platform's default locale encoding.
    with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
        vocab = json.load(f)

    ambiguous_tokens, ambiguous_ids = find_ambiguous_tokens(vocab['token_to_id'])

    print(f"Found {len(ambiguous_tokens)} ambiguous tokens in BPE vocab.")

    output_data = {
        "ambiguous_tokens": ambiguous_tokens,
        "ambiguous_ids": ambiguous_ids,
        "source_vocab": VOCAB_PATH,
    }

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    print(f"Saved to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()