File size: 2,435 Bytes
c5f9e16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
{
  "version": "10B",
  "num_docs": 15368808,
  "num_val_docs": 50000,
  "max_train_tokens": 8000000000,
  "shuffle_seed": null,
  "shard_size": 100000000,
  "append_eos": false,
  "docs_jsonl": "docs_selected.jsonl",
  "docs_meta": {
    "remote_repo_id": "willdepueoai/parameter-golf",
    "remote_root": "datasets",
    "num_docs": 15368808,
    "docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7",
    "source_manifest": "docs_selected.source_manifest.json",
    "source_sidecar": {
      "source_export_root": "/root/exports/fineweb_50Bsub100B_50keval_v0",
      "snapshot_kind": "partial_docs_cache_from_50B_export",
      "note": "not canonical 10B shard selection; train split is a paused snapshot of the 50B shuffled train stream",
      "selection_seed": 1337,
      "num_val_docs": 50000,
      "num_docs": 15368808,
      "docs_val": 50000,
      "docs_train": 15318808,
      "docs_bytes": 48166275520,
      "docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7"
    }
  },
  "tokenizer_specs": [
    {
      "name": "sp_bpe_8192",
      "dataset_suffix": "sp8192",
      "vocab_size": 8192,
      "tokenizer_train_docs": 50000
    }
  ],
  "tokenizers": [
    {
      "name": "sp_bpe_8192",
      "kind": "sentencepiece_bpe",
      "vocab_size": 8192,
      "bos_id": 1,
      "eos_id": 2,
      "recommended_bigram_vocab_size": 40960,
      "source_spec": {
        "name": "sp_bpe_8192",
        "dataset_suffix": "sp8192",
        "vocab_size": 8192,
        "tokenizer_train_docs": 50000
      },
      "model_path": "tokenizers/fineweb_8192_bpe.model",
      "vocab_path": "tokenizers/fineweb_8192_bpe.vocab"
    }
  ],
  "datasets": [
    {
      "name": "fineweb10B_sp8192",
      "tokenizer_name": "sp_bpe_8192",
      "tokenizer_kind": "sentencepiece_bpe",
      "path": "datasets/fineweb10B_sp8192",
      "train_glob": "datasets/fineweb10B_sp8192/fineweb_train_*.bin",
      "val_glob": "datasets/fineweb10B_sp8192/fineweb_val_*.bin",
      "vocab_size": 8192,
      "bos_id": 1,
      "eos_id": 2,
      "recommended_bigram_vocab_size": 40960,
      "stats": {
        "docs_total": 9682720,
        "docs_val": 50000,
        "docs_train": 9632720,
        "files_total": 81,
        "files_val": 1,
        "files_train": 80,
        "tokens_total": 8040547886,
        "tokens_val": 40547886,
        "tokens_train": 8000000000
      }
    }
  ]
}