"""Collect ambiguity-marker tokens from the BPE vocabulary.

Reads ``data/bpe_vocabulary_clean.json``, selects every token that does not
start with ``[`` and that contains ``?`` or the substring ``unknown``
(case-insensitive), and writes the selection to
``data/bpe_ambiguity_tokens.json``.
"""
import json
import re


print("Generating BPE ambiguity tokens...")

# Read the vocabulary as UTF-8 explicitly: JSON text is UTF-8 by convention,
# and relying on the platform's locale encoding can fail or mis-decode
# non-ASCII tokens on systems whose default is not UTF-8.
with open('data/bpe_vocabulary_clean.json', 'r', encoding='utf-8') as f:
    vocab = json.load(f)

# NOTE(review): this set is not consulted by the selection logic below; it is
# kept as a module-level name in case other code relies on it. Confirm whether
# it can be removed or whether the filter was meant to use it.
ambiguous_chars = {'?', 'x', 'X', 'u', 'U', 'd', 'D', 'o', 'O'}

# Map of token -> id for every selected token. Tokens starting with '[' are
# skipped (presumably bracketed special tokens -- verify against the vocab).
ambiguous_tokens = {
    token: token_id
    for token, token_id in vocab['token_to_id'].items()
    if not token.startswith('[')
    and ('?' in token or 'unknown' in token.lower())
}

# Ids in the same order as the tokens above (dicts preserve insertion order).
ambiguous_ids = list(ambiguous_tokens.values())

print(f"Found {len(ambiguous_tokens)} ambiguous tokens in BPE vocab.")

output_data = {
    "ambiguous_tokens": ambiguous_tokens,
    "ambiguous_ids": ambiguous_ids,
    "source_vocab": "data/bpe_vocabulary_clean.json"
}

# json.dump escapes non-ASCII characters by default, so the written bytes are
# plain ASCII; the explicit encoding just removes the locale dependency.
with open('data/bpe_ambiguity_tokens.json', 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2)

print("Saved to data/bpe_ambiguity_tokens.json")
|
|