levossadtchi committed on
Commit
af047e0
·
verified ·
1 Parent(s): 7e3dd98

Upload folder using huggingface_hub

Browse files
data/sft/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data/sft/processed/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data/sft/processed/dataset_summary.json ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "val_examples": 2000,
4
+ "max_train_examples": 200000,
5
+ "min_supervised_tokens": 16,
6
+ "shuffle": true,
7
+ "format": "messages",
8
+ "messages_field": "messages",
9
+ "sources": [
10
+ {
11
+ "source_name": "smol_magpie_ultra",
12
+ "path": "HuggingFaceTB/smoltalk",
13
+ "config_name": "smol-magpie-ultra",
14
+ "split": "train",
15
+ "weight": 0.4,
16
+ "row_filters": {
17
+ "quality": "good"
18
+ }
19
+ },
20
+ {
21
+ "source_name": "openhermes",
22
+ "path": "HuggingFaceTB/smoltalk",
23
+ "config_name": "openhermes-100k",
24
+ "split": "train",
25
+ "weight": 0.15
26
+ },
27
+ {
28
+ "source_name": "self_oss_instruct",
29
+ "path": "HuggingFaceTB/smoltalk",
30
+ "config_name": "self-oss-instruct",
31
+ "split": "train",
32
+ "weight": 0.15
33
+ },
34
+ {
35
+ "source_name": "everyday_conversations",
36
+ "path": "HuggingFaceTB/smoltalk",
37
+ "config_name": "everyday-conversations",
38
+ "split": "train",
39
+ "weight": 0.01
40
+ },
41
+ {
42
+ "source_name": "numina_cot",
43
+ "path": "HuggingFaceTB/smoltalk",
44
+ "config_name": "numina-cot-100k",
45
+ "split": "train",
46
+ "weight": 0.1
47
+ },
48
+ {
49
+ "source_name": "metamathqa",
50
+ "path": "HuggingFaceTB/smoltalk",
51
+ "config_name": "metamathqa-50k",
52
+ "split": "train",
53
+ "weight": 0.05
54
+ },
55
+ {
56
+ "source_name": "longalign",
57
+ "path": "HuggingFaceTB/smoltalk",
58
+ "config_name": "longalign",
59
+ "split": "train",
60
+ "weight": 0.015
61
+ },
62
+ {
63
+ "source_name": "ultrachat_200k",
64
+ "path": "HuggingFaceH4/ultrachat_200k",
65
+ "config_name": null,
66
+ "split": "train_sft",
67
+ "weight": 0.125
68
+ }
69
+ ]
70
+ },
71
+ "sources": [
72
+ {
73
+ "name": "smol_magpie_ultra",
74
+ "path": "HuggingFaceTB/smoltalk",
75
+ "config_name": "smol-magpie-ultra",
76
+ "weight": 0.4,
77
+ "train_target": 80000,
78
+ "val_target": 800,
79
+ "train_examples": 80000,
80
+ "val_examples": 800,
81
+ "rows_seen": 117281,
82
+ "skipped_rows": 36481
83
+ },
84
+ {
85
+ "name": "openhermes",
86
+ "path": "HuggingFaceTB/smoltalk",
87
+ "config_name": "openhermes-100k",
88
+ "weight": 0.15,
89
+ "train_target": 30000,
90
+ "val_target": 300,
91
+ "train_examples": 30000,
92
+ "val_examples": 300,
93
+ "rows_seen": 31945,
94
+ "skipped_rows": 1645
95
+ },
96
+ {
97
+ "name": "self_oss_instruct",
98
+ "path": "HuggingFaceTB/smoltalk",
99
+ "config_name": "self-oss-instruct",
100
+ "weight": 0.15,
101
+ "train_target": 30000,
102
+ "val_target": 300,
103
+ "train_examples": 30000,
104
+ "val_examples": 300,
105
+ "rows_seen": 30300,
106
+ "skipped_rows": 0
107
+ },
108
+ {
109
+ "name": "everyday_conversations",
110
+ "path": "HuggingFaceTB/smoltalk",
111
+ "config_name": "everyday-conversations",
112
+ "weight": 0.01,
113
+ "train_target": 2000,
114
+ "val_target": 20,
115
+ "train_examples": 2000,
116
+ "val_examples": 20,
117
+ "rows_seen": 2020,
118
+ "skipped_rows": 0
119
+ },
120
+ {
121
+ "name": "numina_cot",
122
+ "path": "HuggingFaceTB/smoltalk",
123
+ "config_name": "numina-cot-100k",
124
+ "weight": 0.1,
125
+ "train_target": 20000,
126
+ "val_target": 200,
127
+ "train_examples": 20000,
128
+ "val_examples": 200,
129
+ "rows_seen": 20200,
130
+ "skipped_rows": 0
131
+ },
132
+ {
133
+ "name": "metamathqa",
134
+ "path": "HuggingFaceTB/smoltalk",
135
+ "config_name": "metamathqa-50k",
136
+ "weight": 0.05,
137
+ "train_target": 10000,
138
+ "val_target": 100,
139
+ "train_examples": 10000,
140
+ "val_examples": 100,
141
+ "rows_seen": 10104,
142
+ "skipped_rows": 4
143
+ },
144
+ {
145
+ "name": "longalign",
146
+ "path": "HuggingFaceTB/smoltalk",
147
+ "config_name": "longalign",
148
+ "weight": 0.015,
149
+ "train_target": 3000,
150
+ "val_target": 30,
151
+ "train_examples": 3000,
152
+ "val_examples": 30,
153
+ "rows_seen": 3030,
154
+ "skipped_rows": 0
155
+ },
156
+ {
157
+ "name": "ultrachat_200k",
158
+ "path": "HuggingFaceH4/ultrachat_200k",
159
+ "config_name": null,
160
+ "weight": 0.125,
161
+ "train_target": 25000,
162
+ "val_target": 250,
163
+ "train_examples": 25000,
164
+ "val_examples": 250,
165
+ "rows_seen": 25250,
166
+ "skipped_rows": 0
167
+ }
168
+ ],
169
+ "tokenizer_meta": {
170
+ "vocab_size": 49152,
171
+ "special_tokens": {
172
+ "pad_token": "<pad>",
173
+ "bos_token": "<bos>",
174
+ "eos_token": "<eos>",
175
+ "unk_token": "<unk>",
176
+ "pad_token_id": 0,
177
+ "bos_token_id": 1,
178
+ "eos_token_id": 2,
179
+ "unk_token_id": 3
180
+ },
181
+ "data_config": {
182
+ "sources": [
183
+ {
184
+ "name": "fineweb_edu",
185
+ "path": "HuggingFaceFW/fineweb-edu",
186
+ "split": "train",
187
+ "weight": 0.6,
188
+ "text_field": "text",
189
+ "config_name": "sample-10BT",
190
+ "data_dir": null,
191
+ "revision": null,
192
+ "streaming": true,
193
+ "shuffle_buffer": 10000,
194
+ "sample_documents": null
195
+ },
196
+ {
197
+ "name": "cosmopedia_v2",
198
+ "path": "HuggingFaceTB/smollm-corpus",
199
+ "split": "train",
200
+ "weight": 0.2,
201
+ "text_field": "text",
202
+ "config_name": "cosmopedia-v2",
203
+ "data_dir": null,
204
+ "revision": null,
205
+ "streaming": true,
206
+ "shuffle_buffer": 10000,
207
+ "sample_documents": null
208
+ },
209
+ {
210
+ "name": "the_stack_python",
211
+ "path": "bigcode/the-stack-dedup",
212
+ "split": "train",
213
+ "weight": 0.1,
214
+ "text_field": "content",
215
+ "config_name": null,
216
+ "data_dir": "data/python",
217
+ "revision": null,
218
+ "streaming": true,
219
+ "shuffle_buffer": 2000,
220
+ "sample_documents": null
221
+ },
222
+ {
223
+ "name": "finemath",
224
+ "path": "HuggingFaceTB/finemath",
225
+ "split": "train",
226
+ "weight": 0.1,
227
+ "text_field": "text",
228
+ "config_name": "finemath-4plus",
229
+ "data_dir": null,
230
+ "revision": null,
231
+ "streaming": true,
232
+ "shuffle_buffer": 5000,
233
+ "sample_documents": null
234
+ }
235
+ ],
236
+ "tokenizer_sample_documents": 2000000,
237
+ "tokenizer_min_frequency": 2,
238
+ "tokenizer_special_tokens": [
239
+ "<pad>",
240
+ "<bos>",
241
+ "<eos>",
242
+ "<unk>"
243
+ ],
244
+ "train_tokens": 10000000000,
245
+ "val_tokens": 20000000,
246
+ "shard_size_tokens": 100000000
247
+ }
248
+ },
249
+ "train": {
250
+ "num_examples": 200000,
251
+ "seq_len": 2048,
252
+ "input_ids_path": "train_input_ids.bin",
253
+ "labels_path": "train_labels.bin"
254
+ },
255
+ "val": {
256
+ "num_examples": 2000,
257
+ "seq_len": 2048,
258
+ "input_ids_path": "val_input_ids.bin",
259
+ "labels_path": "val_labels.bin"
260
+ }
261
+ }
data/sft/processed/logs/prepare_sft_data_20260315_132126.log ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-03-15 13:21:26,604 | INFO | SFT data preparation started
2
+ 2026-03-15 13:21:26,604 | INFO | Log file: data/sft/processed/logs/prepare_sft_data_20260315_132126.log
3
+ 2026-03-15 13:21:26,605 | INFO | Arguments | config=configs/sft_data_smoltalk.json tokenizer_dir=data/tokenizer output_dir=data/sft/processed seq_len=2048 seed=42
4
+ 2026-03-15 13:21:26,605 | INFO | SFT mixture config | num_sources=8 val_examples=2000 max_train_examples=200000
5
+ 2026-03-15 13:21:26,605 | INFO | SFT packing config | seq_len=2048 min_supervised_tokens=16
6
+ 2026-03-15 13:21:26,605 | INFO | SFT source[0] | name=smol_magpie_ultra path=HuggingFaceTB/smoltalk config_name=smol-magpie-ultra split=train format=messages streaming=False weight=0.4 row_filters={'quality': 'good'} val_target=800 train_target=80000
7
+ 2026-03-15 13:21:26,605 | INFO | SFT source[1] | name=openhermes path=HuggingFaceTB/smoltalk config_name=openhermes-100k split=train format=messages streaming=False weight=0.15 row_filters=None val_target=300 train_target=30000
8
+ 2026-03-15 13:21:26,605 | INFO | SFT source[2] | name=self_oss_instruct path=HuggingFaceTB/smoltalk config_name=self-oss-instruct split=train format=messages streaming=False weight=0.15 row_filters=None val_target=300 train_target=30000
9
+ 2026-03-15 13:21:26,605 | INFO | SFT source[3] | name=everyday_conversations path=HuggingFaceTB/smoltalk config_name=everyday-conversations split=train format=messages streaming=False weight=0.01 row_filters=None val_target=20 train_target=2000
10
+ 2026-03-15 13:21:26,605 | INFO | SFT source[4] | name=numina_cot path=HuggingFaceTB/smoltalk config_name=numina-cot-100k split=train format=messages streaming=False weight=0.1 row_filters=None val_target=200 train_target=20000
11
+ 2026-03-15 13:21:26,605 | INFO | SFT source[5] | name=metamathqa path=HuggingFaceTB/smoltalk config_name=metamathqa-50k split=train format=messages streaming=False weight=0.05 row_filters=None val_target=100 train_target=10000
12
+ 2026-03-15 13:21:26,605 | INFO | SFT source[6] | name=longalign path=HuggingFaceTB/smoltalk config_name=longalign split=train format=messages streaming=False weight=0.015 row_filters=None val_target=30 train_target=3000
13
+ 2026-03-15 13:21:26,605 | INFO | SFT source[7] | name=ultrachat_200k path=HuggingFaceH4/ultrachat_200k config_name=None split=train_sft format=messages streaming=False weight=0.125 row_filters=None val_target=250 train_target=25000
14
+ 2026-03-15 13:21:26,605 | INFO | Tokenizer special ids | bos=1 eos=2 pad=0
15
+ 2026-03-15 13:21:26,606 | INFO | Loading SFT source | name=smol_magpie_ultra
16
+ 2026-03-15 13:21:49,343 | INFO | SFT progress | processed=5,000 train_examples=4,200 val_examples=800 skipped=2,212
17
+ 2026-03-15 13:22:07,970 | INFO | SFT progress | processed=10,000 train_examples=9,200 val_examples=800 skipped=4,536
18
+ 2026-03-15 13:22:26,634 | INFO | SFT progress | processed=15,000 train_examples=14,200 val_examples=800 skipped=6,798
19
+ 2026-03-15 13:22:44,959 | INFO | SFT progress | processed=20,000 train_examples=19,200 val_examples=800 skipped=9,047
20
+ 2026-03-15 13:23:03,316 | INFO | SFT progress | processed=25,000 train_examples=24,200 val_examples=800 skipped=11,398
21
+ 2026-03-15 13:23:21,705 | INFO | SFT progress | processed=30,000 train_examples=29,200 val_examples=800 skipped=13,716
22
+ 2026-03-15 13:23:39,935 | INFO | SFT progress | processed=35,000 train_examples=34,200 val_examples=800 skipped=15,985
23
+ 2026-03-15 13:23:58,367 | INFO | SFT progress | processed=40,000 train_examples=39,200 val_examples=800 skipped=18,284
24
+ 2026-03-15 13:24:16,745 | INFO | SFT progress | processed=45,000 train_examples=44,200 val_examples=800 skipped=20,512
25
+ 2026-03-15 13:24:35,169 | INFO | SFT progress | processed=50,000 train_examples=49,200 val_examples=800 skipped=22,749
26
+ 2026-03-15 13:24:53,377 | INFO | SFT progress | processed=55,000 train_examples=54,200 val_examples=800 skipped=24,949
27
+ 2026-03-15 13:25:11,868 | INFO | SFT progress | processed=60,000 train_examples=59,200 val_examples=800 skipped=27,188
28
+ 2026-03-15 13:25:30,314 | INFO | SFT progress | processed=65,000 train_examples=64,200 val_examples=800 skipped=29,431
29
+ 2026-03-15 13:25:48,714 | INFO | SFT progress | processed=70,000 train_examples=69,200 val_examples=800 skipped=31,716
30
+ 2026-03-15 13:26:07,119 | INFO | SFT progress | processed=75,000 train_examples=74,200 val_examples=800 skipped=33,870
31
+ 2026-03-15 13:26:25,775 | INFO | SFT progress | processed=80,000 train_examples=79,200 val_examples=800 skipped=36,145
32
+ 2026-03-15 13:26:28,721 | INFO | Completed SFT source | name=smol_magpie_ultra train=80,000/80000 val=800/800 seen=117,281 skipped=36,481
33
+ 2026-03-15 13:26:28,721 | INFO | Loading SFT source | name=openhermes
34
+ 2026-03-15 13:26:36,651 | INFO | SFT progress | processed=85,000 train_examples=83,900 val_examples=1,100 skipped=36,707
35
+ 2026-03-15 13:26:42,553 | INFO | SFT progress | processed=90,000 train_examples=88,900 val_examples=1,100 skipped=36,961
36
+ 2026-03-15 13:26:48,344 | INFO | SFT progress | processed=95,000 train_examples=93,900 val_examples=1,100 skipped=37,227
37
+ 2026-03-15 13:26:54,249 | INFO | SFT progress | processed=100,000 train_examples=98,900 val_examples=1,100 skipped=37,516
38
+ 2026-03-15 13:27:00,205 | INFO | SFT progress | processed=105,000 train_examples=103,900 val_examples=1,100 skipped=37,782
39
+ 2026-03-15 13:27:06,261 | INFO | SFT progress | processed=110,000 train_examples=108,900 val_examples=1,100 skipped=38,065
40
+ 2026-03-15 13:27:07,568 | INFO | Completed SFT source | name=openhermes train=30,000/30000 val=300/300 seen=31,945 skipped=1,645
41
+ 2026-03-15 13:27:07,568 | INFO | Loading SFT source | name=self_oss_instruct
42
+ 2026-03-15 13:27:17,619 | INFO | SFT progress | processed=115,000 train_examples=113,600 val_examples=1,400 skipped=38,126
43
+ 2026-03-15 13:27:22,498 | INFO | SFT progress | processed=120,000 train_examples=118,600 val_examples=1,400 skipped=38,126
44
+ 2026-03-15 13:27:27,485 | INFO | SFT progress | processed=125,000 train_examples=123,600 val_examples=1,400 skipped=38,126
45
+ 2026-03-15 13:27:32,482 | INFO | SFT progress | processed=130,000 train_examples=128,600 val_examples=1,400 skipped=38,126
46
+ 2026-03-15 13:27:37,473 | INFO | SFT progress | processed=135,000 train_examples=133,600 val_examples=1,400 skipped=38,126
47
+ 2026-03-15 13:27:42,522 | INFO | SFT progress | processed=140,000 train_examples=138,600 val_examples=1,400 skipped=38,126
48
+ 2026-03-15 13:27:43,916 | INFO | Completed SFT source | name=self_oss_instruct train=30,000/30000 val=300/300 seen=30,300 skipped=0
49
+ 2026-03-15 13:27:43,916 | INFO | Loading SFT source | name=everyday_conversations
50
+ 2026-03-15 13:27:49,524 | INFO | Completed SFT source | name=everyday_conversations train=2,000/2000 val=20/20 seen=2,020 skipped=0
51
+ 2026-03-15 13:27:49,525 | INFO | Loading SFT source | name=numina_cot
52
+ 2026-03-15 13:27:56,930 | INFO | SFT progress | processed=145,000 train_examples=143,380 val_examples=1,620 skipped=38,126
53
+ 2026-03-15 13:28:03,530 | INFO | SFT progress | processed=150,000 train_examples=148,380 val_examples=1,620 skipped=38,126
54
+ 2026-03-15 13:28:09,916 | INFO | SFT progress | processed=155,000 train_examples=153,380 val_examples=1,620 skipped=38,126
55
+ 2026-03-15 13:28:16,444 | INFO | SFT progress | processed=160,000 train_examples=158,380 val_examples=1,620 skipped=38,126
56
+ 2026-03-15 13:28:21,164 | INFO | Completed SFT source | name=numina_cot train=20,000/20000 val=200/200 seen=20,200 skipped=0
57
+ 2026-03-15 13:28:21,165 | INFO | Loading SFT source | name=metamathqa
58
+ 2026-03-15 13:28:26,153 | INFO | SFT progress | processed=165,000 train_examples=163,280 val_examples=1,720 skipped=38,126
59
+ 2026-03-15 13:28:29,853 | INFO | SFT progress | processed=170,000 train_examples=168,280 val_examples=1,720 skipped=38,127
60
+ 2026-03-15 13:28:32,549 | INFO | Completed SFT source | name=metamathqa train=10,000/10000 val=100/100 seen=10,104 skipped=4
61
+ 2026-03-15 13:28:32,549 | INFO | Loading SFT source | name=longalign
62
+ 2026-03-15 13:29:03,538 | INFO | SFT progress | processed=175,000 train_examples=173,250 val_examples=1,750 skipped=38,130
63
+ 2026-03-15 13:29:42,829 | INFO | Completed SFT source | name=longalign train=3,000/3000 val=30/30 seen=3,030 skipped=0
64
+ 2026-03-15 13:29:42,830 | INFO | Loading SFT source | name=ultrachat_200k
65
+ 2026-03-15 13:29:56,989 | INFO | SFT progress | processed=180,000 train_examples=178,000 val_examples=2,000 skipped=38,130
66
+ 2026-03-15 13:30:12,911 | INFO | SFT progress | processed=185,000 train_examples=183,000 val_examples=2,000 skipped=38,130
67
+ 2026-03-15 13:30:28,635 | INFO | SFT progress | processed=190,000 train_examples=188,000 val_examples=2,000 skipped=38,130
68
+ 2026-03-15 13:30:44,882 | INFO | SFT progress | processed=195,000 train_examples=193,000 val_examples=2,000 skipped=38,130
69
+ 2026-03-15 13:31:01,202 | INFO | SFT progress | processed=200,000 train_examples=198,000 val_examples=2,000 skipped=38,130
70
+ 2026-03-15 13:31:07,611 | INFO | Completed SFT source | name=ultrachat_200k train=25,000/25000 val=250/250 seen=25,250 skipped=0
71
+ 2026-03-15 13:31:07,614 | INFO | SFT dataset saved | output_dir=data/sft/processed
72
+ 2026-03-15 13:31:07,615 | INFO | SFT summary | train_examples=200,000 val_examples=2,000 skipped_rows=38,130
73
+ 2026-03-15 13:31:07,615 | INFO | SFT metadata saved | path=data/sft/processed/dataset_summary.json
data/sft/processed/train_input_ids.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fec9583e942ce4ba4abc4cfe1f7db4e7dd74d166405b3e3570dd0858d939b2a3
3
+ size 819200000
data/sft/processed/train_labels.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2638b86f114f11c3441e27b39206fa5b7625376988792daf51676d3762651d9
3
+ size 1638400000
data/sft/processed/train_metadata.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "num_examples": 200000,
3
+ "seq_len": 2048,
4
+ "input_ids_path": "train_input_ids.bin",
5
+ "labels_path": "train_labels.bin"
6
+ }
data/sft/processed/val_input_ids.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd248a527b9b3fcd829a29780341d5958c63e25453035398beb633b65778efc6
3
+ size 8192000
data/sft/processed/val_labels.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:857795ae824b5ea4d74a551d1d3075d9bcd03f68912afa6589e5f831f66a2ad5
3
+ size 16384000
data/sft/processed/val_metadata.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "num_examples": 2000,
3
+ "seq_len": 2048,
4
+ "input_ids_path": "val_input_ids.bin",
5
+ "labels_path": "val_labels.bin"
6
+ }