tokenizer-3min-32k / metadata.json
Tralalabs's picture
Upload 3 files
bddb802 verified
{
"tokenizer_type": "ByteLevel BPE",
"vocab_size": 32000,
"target_texts": 700000,
"min_chars": 80,
"max_chars": 12000,
"datasets": [
{
"dataset": "allenai/c4",
"config": "en",
"split": "train",
"column": "text",
"share": 0.5,
"target_texts": 350000
},
{
"dataset": "HuggingFaceFW/fineweb-edu",
"config": null,
"split": "train",
"column": "text",
"share": 0.2,
"target_texts": 140000
},
{
"dataset": "wikimedia/wikipedia",
"config": "20231101.en",
"split": "train",
"column": "text",
"share": 0.1,
"target_texts": 70000
},
{
"dataset": "codeparrot/codeparrot-clean",
"config": null,
"split": "train",
"column": "content",
"share": 0.1,
"target_texts": 70000
},
{
"dataset": "allenai/c4",
"config": "es",
"split": "train",
"column": "text",
"share": 0.1,
"target_texts": 70000
}
],
"elapsed_seconds": 209.82311129570007,
"elapsed_minutes": 3.4970518549283347,
"output_dir": "/out/tokenizer-bpe-32k"
}