# SuperBPE: Space Travel for Language Models
# --- Environment setup ---
# Install the Rust toolchain via rustup (official install method).
# NOTE(review): presumably needed by a Rust-based dependency of the
# tokenizer-training pipeline — confirm against requirements.txt / build docs.
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# Create an isolated Python virtual environment, activate it, and install
# the project's Python dependencies into it.
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
# --- Data preparation (commented out: run once to materialize the subsamples) ---
# OpenCoder mixed-language data: lines 1-300,000 become tokenizer-training
# data; lines 300,001-350,000 (head 350000 | tail 50000) become held-out
# eval data, so train and eval do not overlap.
# head -n 300000 /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl > subsample_opencoder.jsonl
# head -n 350000 /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl | tail -n 50000 > eval_opencoder.jsonl
# OpenCoder Python-only data: lines 1-500,000 for training; lines
# 500,001-550,000 for eval (same disjoint-split pattern as above).
# head -n 500000 /mnt/hdfs/user/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl > subsample_python.jsonl
# head -n 550000 /mnt/hdfs/user/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl | tail -n 50000 > eval_python.jsonl
# --- Tokenizer training (two-stage) and evaluation ---
# Stage 1: train on the 500K-line Python subsample.
# NOTE(review): positional-argument meanings inferred from the values passed
# here (run name, vocab size, training data path, stage, [stage-1 dir,
# transition point]) — confirm against scripts/train_tokenizer.sh.
bash scripts/train_tokenizer.sh python500k 400000 /opt/tiger/byte-lingua/superbpe/subsample_python.jsonl 1
# Stage 2: continue from the stage-1 tokenizer output, passing 250000 as the
# final argument (presumably the stage-1 -> stage-2 transition vocab size).
bash scripts/train_tokenizer.sh python500k 400000 /opt/tiger/byte-lingua/superbpe/subsample_python.jsonl 2 tokenizer_json/python500k_vocab400K_stage1 250000
# Evaluate compression rate of each tokenizer on the held-out Python eval set.
python scripts/evaluate_compression_rate.py --eval_data_path eval_python.jsonl --tokenizer_path /opt/tiger/byte-lingua/superbpe/tokenizer_json/python500k_vocab400K_stage1/tokenizer.json
# NOTE(review): stage 2 above was invoked with transition point 250000, but
# this eval path references "from350K" — verify the stage-2 output directory
# name matches the actual run, or the eval will read a stale/missing tokenizer.
python scripts/evaluate_compression_rate.py --eval_data_path eval_python.jsonl --tokenizer_path /opt/tiger/byte-lingua/superbpe/tokenizer_json/python500k_vocab400K_from350K_stage2/tokenizer.json