2ira
/

Byte-lingua-code

Model card Files Files and versions

Byte-lingua-code / superbpe /tokenizers_superbpe /tokenizers /examples /unstable_wasm /src /lib.rs

2ira's picture

offline_compression_graph_code

72c0672 verified 5 months ago

history blame contribute delete

1.11 kB

	mod utils;
	use tokenizers::models::bpe::{Vocab, BPE};
	use tokenizers::Tokenizer;

	use wasm_bindgen::prelude::*;

	// Opt-in allocator swap: when this crate is compiled with the `wee_alloc`
	// Cargo feature enabled, `wee_alloc::WeeAlloc` is registered as the global
	// allocator. Without the feature, this item is compiled out and the
	// default allocator stays in place.
	#[cfg(feature = "wee_alloc")]
	#[global_allocator]
	static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;

	/// Tokenizes `string` with a tiny, hard-coded BPE model and returns the
	/// resulting token ids.
	///
	/// The model is rebuilt on every call from a fixed vocabulary
	/// (`a`, `##b`, `##c`, `ab`, `abc`, `[UNK]`) and two merges
	/// (`a + ##b` and `ab + ##c`), using `##` as the continuing-subword prefix.
	/// No special tokens are added while encoding.
	///
	/// # Panics
	///
	/// Panics if the BPE model cannot be built or if encoding fails.
	#[wasm_bindgen]
	pub fn tokenize(string: &str) -> Vec<u32> {
	    let vocab: Vocab = vec![
	        ("a".to_string(), 0),
	        ("##b".to_string(), 1),
	        ("##c".to_string(), 2),
	        ("ab".to_string(), 3),
	        ("abc".to_string(), 4),
	        // The unknown token configured below must itself be in the
	        // vocabulary; otherwise input containing an out-of-vocabulary symbol
	        // makes encoding fail instead of yielding an unknown-token id.
	        ("[UNK]".to_string(), 5),
	    ]
	    .into_iter()
	    .collect();

	    let merges = vec![
	        ("a".to_string(), "##b".to_string()),
	        ("ab".to_string(), "##c".to_string()),
	    ];

	    let bpe = BPE::builder()
	        .vocab_and_merges(vocab, merges)
	        .unk_token("[UNK]".to_string())
	        .continuing_subword_prefix("##".to_string())
	        .build()
	        .expect("hard-coded vocabulary and merges must form a valid BPE model");

	    let tokenizer = Tokenizer::new(bpe);
	    let encoding = tokenizer
	        .encode(string, false)
	        .expect("encoding must succeed: unknown symbols map to `[UNK]`");

	    // `get_ids` hands back a borrowed slice; copy it into an owned `Vec`
	    // so the result can outlive `encoding`.
	    encoding.get_ids().to_vec()
	}