File size: 1,294 Bytes
72c0672
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
# Launches a batch of scripts/train_tokenizer.sh runs, one after another, for
# two datasets. Paths are relative, so this is expected to be started from the
# directory that contains scripts/ and tokenizer_json/.
#
# The meaning of each positional argument is defined by
# scripts/train_tokenizer.sh, which is not visible here; this file only fixes
# the values that are passed.

# Run scripts/train_tokenizer.sh once per value in a list.
# Arguments:
#   $1   - dataset tag; passed as the first argument and also used to build the
#          fifth argument, tokenizer_json/<tag>_vocab800K_stage1
#   $2   - path to the .jsonl file passed as the third argument
#   $3.. - values passed, one per run, as the sixth (last) argument
# Returns: exit status of the last run. A failing run does not stop the
#          remaining ones.
train_series() {
  local tag=$1
  local corpus=$2
  shift 2

  local last_arg
  for last_arg in "$@"; do
    bash scripts/train_tokenizer.sh "$tag" 800000 "$corpus" 2 \
      "tokenizer_json/${tag}_vocab800K_stage1" "$last_arg"
  done
}

# NOTE(review): the second value differs between the two datasets
# (320000 here vs 160000 below) -- confirm that this is intentional.
train_series python500k \
  /opt/tiger/byte-lingua/superbpe/subsample_python.jsonl \
  80000 320000 480000 640000

train_series opencoder300k \
  /opt/tiger/byte-lingua/superbpe/subsample_opencoder.jsonl \
  80000 160000 480000 640000