import sentencepiece as spm
from transformers import T5Tokenizer

# Mixed-domain training corpus: DNA, English, and protein text, one sentence per line.
corpus_files = ["dna_4g.txt", "eng_4g.txt", "protein_4g.txt"]

# Count the total number of training sentences (one line per sentence) across all files.
total_sentences = 0
for file in corpus_files:
    with open(file, "r", encoding="utf-8") as f:
        total_sentences += sum(1 for _ in f)
print(f"Total sentences: {total_sentences}")

# Subsample a third of the corpus to bound training time and memory.
sample_sentences = total_sentences // 3

# Train a unigram-LM SentencePiece model over the mixed corpus.
spm.SentencePieceTrainer.train(
    input=corpus_files,
    model_prefix="spm_gene_eng",
    model_type="unigram",
    vocab_size=90000,
    # Match the T5 special-token layout: <pad>=0, </s>=1, <unk>=2, no BOS token.
    pad_id=0,
    bos_id=-1,
    eos_id=1,
    unk_id=2,
    # Reserve the 100 T5 sentinel tokens; descending order mirrors the original
    # T5 vocab, where <extra_id_0> has the highest sentinel id.
    user_defined_symbols=",".join([f"<extra_id_{i}>" for i in range(99, -1, -1)]),
    input_sentence_size=sample_sentences,  # train on the sampled third only
    shuffle_input_sentence=True,
    character_coverage=1.0,
    train_extremely_large_corpus=True,
    num_threads=64,
)
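
# Optional sanity check, a minimal sketch: load the raw SentencePiece model that
# training just wrote and confirm the piece count, and that a sentinel token is
# kept as a single piece (assumes "spm_gene_eng.model" is in the working directory).
sp = spm.SentencePieceProcessor(model_file="spm_gene_eng.model")
print(sp.get_piece_size())                      # expected: 90000
print(sp.encode("<extra_id_0>", out_type=str))  # sentinel stays one piece (a leading "▁" may appear)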

# Wrap the trained SentencePiece model in a Hugging Face T5Tokenizer.
# Note: T5Tokenizer appends extra_ids=100 sentinels by default; since the sentinels
# are already user-defined symbols inside this model, the reported vocabulary size
# can differ across transformers versions.
trained_tokenizer = T5Tokenizer("spm_gene_eng.model")
print(f"Vocabulary size: {trained_tokenizer.vocab_size}")

# Persist in Hugging Face format so it can be reloaded with from_pretrained().
trained_tokenizer.save_pretrained("trained_t5_gene_eng_tokenizer")

# Quick sanity check: tokenize a raw DNA sequence with the new vocabulary.
text = "TGGATAACATACGGTATAAGGTTTTGATCACTATAGTTTTGTAATATAGCTTGAAATTAAGAAGTGTGATGCCTCCAGGCTTGTTCT"
print(trained_tokenizer.tokenize(text))
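
# Usage sketch: reload the saved tokenizer the way downstream training code would,
# then verify a sentinel token maps to an id (assumes the directory saved above exists;
# the DNA/protein strings are illustrative inputs only).
reloaded = T5Tokenizer.from_pretrained("trained_t5_gene_eng_tokenizer")
print(reloaded.tokenize("ATGCGT <extra_id_0> MKTAYIAK"))
print(reloaded.convert_tokens_to_ids("<extra_id_0>"))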