supanthadey1
/

bertose-affinose-training-code

reproducibility

Model card Files Files and versions

bertose-affinose-training-code / code /tokenizer /generate_bpe_ambiguity.py

supanthadey1's picture

Add BERTose and AFFINose training code release

1d6f391 verified about 2 months ago

History Blame Contribute Delete

1.39 kB

	import json
	import re

	print("Generating BPE ambiguity tokens...")

	# Load new BPE vocabulary
	with open('data/bpe_vocabulary_clean.json', 'r') as f:
	vocab = json.load(f)

	# Old ambiguity tokens (for reference)
	ambiguous_chars = {'?', 'x', 'X', 'u', 'U', 'd', 'D', 'o', 'O'}
	# Note: Some of these might be valid standard chars in WURCS (like 'u' unknown anomer),
	# but we want to know ANY token that contains uncertainty.

	ambiguous_tokens = {}
	ambiguous_ids = []

	for token, token_id in vocab['token_to_id'].items():
	if token.startswith('['): continue # Skip special tokens

	# Check if token contains any ambiguous marker
	# '?' is the main one. WURCS uses '?' for unknown linkage/anomer.
	if '?' in token or 'unknown' in token.lower():
	ambiguous_tokens[token] = token_id
	ambiguous_ids.append(token_id)
	# Also check for 'X' or 'x' if they are used as unknowns in this specific dataset context
	# (WURCS uses 'x' for unknown anomer sometimes, 'X' for unknown residue?)

	print(f"Found {len(ambiguous_tokens)} ambiguous tokens in BPE vocab.")

	# Save
	output_data = {
	"ambiguous_tokens": ambiguous_tokens,
	"ambiguous_ids": ambiguous_ids,
	"source_vocab": "data/bpe_vocabulary_clean.json"
	}

	with open('data/bpe_ambiguity_tokens.json', 'w') as f:
	json.dump(output_data, f, indent=2)

	print(f"Saved to data/bpe_ambiguity_tokens.json")