# SuperBPE: Space Travel for Language Models
```bash
# --- Environment setup ---
# Install the Rust toolchain (needed to build the `tokenizers` dependency from source).
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# Create and activate an isolated Python environment, then install project dependencies.
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt

# --- Data preparation (already done; kept commented as a record of how the subsets were made) ---
# First N lines -> tokenizer training subsample; the following 50K lines -> held-out eval set.
# head -n 300000 /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl > subsample_opencoder.jsonl
# head -n 350000 /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl | tail -n 50000 > eval_opencoder.jsonl
# head -n 500000 /mnt/hdfs/user/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl > subsample_python.jsonl
# head -n 550000 /mnt/hdfs/user/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl | tail -n 50000 > eval_python.jsonl

# --- Tokenizer training ---
# Stage 1: train a tokenizer with a 400K vocabulary on the Python subsample.
bash scripts/train_tokenizer.sh python500k 400000 /opt/tiger/byte-lingua/superbpe/subsample_python.jsonl 1
# Stage 2: continue from the stage-1 tokenizer; the trailing 250000 is presumably the
# vocab size at which stage 2 takes over — confirm against scripts/train_tokenizer.sh.
bash scripts/train_tokenizer.sh python500k 400000 /opt/tiger/byte-lingua/superbpe/subsample_python.jsonl 2 tokenizer_json/python500k_vocab400K_stage1 250000
# --- Evaluation ---
# Compare compression rate of the stage-1 and stage-2 tokenizers on the held-out eval set.
python scripts/evaluate_compression_rate.py --eval_data_path eval_python.jsonl --tokenizer_path /opt/tiger/byte-lingua/superbpe/tokenizer_json/python500k_vocab400K_stage1/tokenizer.json
# NOTE(review): original referenced ..._from350K_stage2, but the stage-2 command above uses
# 250000 — changed to from250K to match; verify the output dir name train_tokenizer.sh produces.
python scripts/evaluate_compression_rate.py --eval_data_path eval_python.jsonl --tokenizer_path /opt/tiger/byte-lingua/superbpe/tokenizer_json/python500k_vocab400K_from250K_stage2/tokenizer.json
```