szxllm committed on
Commit
6d0972d
·
verified ·
1 Parent(s): 71c3881

Update data_config.py

Browse files
Files changed (1) hide show
  1. data_config.py +429 -291
data_config.py CHANGED
@@ -1,292 +1,430 @@
1
- # data_config.py
2
- """
3
- 预训练和后训练数据集配置
4
- """
5
-
6
- PRETRAIN_DATASETS = {
7
- # 文本数据集
8
- 'the_pile': {
9
- 'type': 'text',
10
- 'hf_path': 'EleutherAI/pile',
11
- 'split': 'train',
12
- 'streaming': True,
13
- 'text_field': 'text',
14
- 'weight': 1.0,
15
- 'description': 'The Pile - 825GB diverse text corpus'
16
- },
17
- 'c4': {
18
- 'type': 'text',
19
- 'hf_path': 'allenai/c4',
20
- 'config': 'en',
21
- 'split': 'train',
22
- 'streaming': True,
23
- 'text_field': 'text',
24
- 'weight': 0.5,
25
- 'description': 'C4 - Colossal Clean Crawled Corpus'
26
- },
27
- 'wikipedia': {
28
- 'type': 'text',
29
- 'hf_path': 'HuggingFaceFW/fineweb-edu',
30
- 'config': 'sample-10BT',
31
- 'split': 'train',
32
- 'streaming': True,
33
- 'text_field': 'text',
34
- 'weight': 0.3,
35
- 'description': 'FineWeb Edu - High quality educational content'
36
- },
37
- 'bookcorpus': {
38
- 'type': 'text',
39
- 'hf_path': 'HuggingFaceTB/smollm-corpus',
40
- 'config': 'cosmopedia-v2',
41
- 'split': 'train',
42
- 'streaming': True,
43
- 'text_field': 'text',
44
- 'weight': 0.2,
45
- 'description': 'Synthetic textbooks and stories'
46
- },
47
- # 代码数据集
48
- 'codeparrot': {
49
- 'type': 'code',
50
- 'hf_path': 'bigcode/the-stack-smol',
51
- 'config': 'default',
52
- 'split': 'train',
53
- 'streaming': True,
54
- 'text_field': 'content',
55
- 'weight': 0.3,
56
- 'description': 'The Stack Smol - code'
57
- },
58
- 'the_stack': {
59
- 'type': 'code',
60
- 'hf_path': 'bigcode/the-stack-dedup',
61
- 'split': 'train',
62
- 'streaming': True,
63
- 'text_field': 'content',
64
- 'weight': 0.2,
65
- 'description': 'The Stack - deduplicated code'
66
- },
67
- # 多模态数据集
68
- 'laion400m': {
69
- 'type': 'image_text',
70
- 'hf_path': 'laion/laion400m',
71
- 'split': 'train',
72
- 'streaming': True,
73
- 'image_field': 'url',
74
- 'text_field': 'caption',
75
- 'weight': 0.4,
76
- 'description': 'LAION-400M image-text pairs'
77
- },
78
- 'conceptual_captions': {
79
- 'type': 'image_text',
80
- 'hf_path': 'google-research-datasets/conceptual_captions',
81
- 'split': 'train',
82
- 'streaming': False,
83
- 'image_field': 'image_url',
84
- 'text_field': 'caption',
85
- 'weight': 0.2,
86
- 'description': 'Conceptual Captions 3M'
87
- },
88
- }
89
-
90
- # 后训练数据集配置(instruction tuning + alignment)
91
- POSTTRAIN_DATASETS = {
92
- # Instruction Tuning数据集
93
- 'flan_v2': {
94
- 'type': 'instruction',
95
- 'hf_path': 'Muennighoff/flan',
96
- 'split': 'train',
97
- 'streaming': True,
98
- 'instruction_field': 'inputs',
99
- 'response_field': 'targets',
100
- 'weight': 1.0,
101
- 'max_samples': 100000,
102
- 'description': 'FLAN v2 collection'
103
- },
104
- 'alpaca': {
105
- 'type': 'instruction',
106
- 'hf_path': 'tatsu-lab/alpaca',
107
- 'split': 'train',
108
- 'streaming': False,
109
- 'instruction_field': 'instruction',
110
- 'input_field': 'input',
111
- 'response_field': 'output',
112
- 'weight': 0.5,
113
- 'description': 'Stanford Alpaca 52K'
114
- },
115
- 'dolly': {
116
- 'type': 'instruction',
117
- 'hf_path': 'databricks/databricks-dolly-15k',
118
- 'split': 'train',
119
- 'streaming': False,
120
- 'instruction_field': 'instruction',
121
- 'context_field': 'context', # Dolly有context字段
122
- 'response_field': 'response',
123
- 'weight': 0.3,
124
- 'description': 'Dolly 15K'
125
- },
126
- 'oasst1': {
127
- 'type': 'conversation',
128
- 'hf_path': 'OpenAssistant/oasst1',
129
- 'split': 'train',
130
- 'streaming': False,
131
- 'weight': 0.4,
132
- 'description': 'OpenAssistant Conversations',
133
- # OASST1需要特殊处理,因为它是树形结构
134
- # 可能需要自定义预处理
135
- },
136
- 'sharegpt': {
137
- 'type': 'conversation',
138
- 'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
139
- 'split': 'train',
140
- 'streaming': False,
141
- 'weight': 0.3,
142
- 'max_samples': 50000,
143
- 'description': 'ShareGPT conversations'
144
- },
145
- # Code instruction数据集
146
- 'code_alpaca': {
147
- 'type': 'code_instruction',
148
- 'hf_path': 'sahil2801/CodeAlpaca-20k',
149
- 'split': 'train',
150
- 'streaming': False,
151
- 'instruction_field': 'instruction',
152
- 'response_field': 'output',
153
- 'weight': 0.3,
154
- 'description': 'Code Alpaca 20K'
155
- },
156
- # 多模态instruction数据集
157
- 'llava_instruct': {
158
- 'type': 'multimodal_instruction',
159
- 'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
160
- 'split': 'train',
161
- 'streaming': False,
162
- 'image_field': 'image',
163
- 'instruction_field': 'conversations',
164
- 'weight': 0.5,
165
- 'description': 'LLaVA visual instruction tuning'
166
- },
167
- # Preference数据集 (用于RLHF)
168
- 'hh_rlhf': {
169
- 'type': 'preference',
170
- 'hf_path': 'Anthropic/hh-rlhf',
171
- 'split': 'train',
172
- 'streaming': False,
173
- 'chosen_field': 'chosen',
174
- 'rejected_field': 'rejected',
175
- 'weight': 1.0,
176
- 'description': 'Anthropic HH-RLHF'
177
- },
178
- 'ultrafeedback': {
179
- 'type': 'preference',
180
- 'hf_path': 'openbmb/UltraFeedback',
181
- 'split': 'train',
182
- 'streaming': True,
183
- 'chosen_field': 'chosen', # 添加字段配置
184
- 'rejected_field': 'rejected',
185
- 'weight': 0.5,
186
- 'max_samples': 50000,
187
- 'description': 'UltraFeedback preferences'
188
- },
189
- 'debug_water': {
190
- 'type': 'instruction',
191
- 'hf_path': 'json', # 使用 json 加载器
192
- 'data_files': 'debug_water.json', # 指向刚才生成的文件
193
- 'split': 'train',
194
- 'streaming': False,
195
- 'instruction_field': 'instruction',
196
- 'response_field': 'output',
197
- 'weight': 1.0,
198
- 'description': 'Overfitting test for water'
199
- },
200
- }
201
-
202
- # 轻量级测试数据集(用于快速验证)
203
- TEST_DATASETS = {
204
- 'tiny_shakespeare': {
205
- 'type': 'text',
206
- 'hf_path': 'tiny_shakespeare',
207
- 'split': 'train',
208
- 'streaming': False,
209
- 'text_field': 'text',
210
- 'weight': 1.0,
211
- 'description': 'Tiny Shakespeare for testing'
212
- },
213
- 'gsm8k': {
214
- 'type': 'instruction',
215
- 'hf_path': 'gsm8k',
216
- 'config': 'main',
217
- 'split': 'train',
218
- 'streaming': False,
219
- 'instruction_field': 'question',
220
- 'response_field': 'answer',
221
- 'weight': 1.0,
222
- 'description': 'GSM8K math problems'
223
- },
224
- }
225
-
226
- # 数据集混合策略
227
- PRETRAIN_MIX = {
228
- 'default': {
229
- 'datasets': ['c4', 'wikipedia', 'bookcorpus', 'codeparrot'],
230
- 'weights': [0.5, 0.2, 0.2, 0.1],
231
- 'description': 'Default pretrain mix'
232
- },
233
- 'code_heavy': {
234
- 'datasets': ['c4', 'codeparrot', 'the_stack', 'wikipedia'],
235
- 'weights': [0.3, 0.4, 0.2, 0.1],
236
- 'description': 'Code-heavy mix'
237
- },
238
- 'multimodal': {
239
- 'datasets': ['c4', 'wikipedia', 'laion400m', 'conceptual_captions'],
240
- 'weights': [0.4, 0.2, 0.3, 0.1],
241
- 'description': 'Multimodal mix'
242
- },
243
- 'text_only': {
244
- 'datasets': ['c4', 'wikipedia', 'bookcorpus'],
245
- 'weights': [0.5, 0.3, 0.2],
246
- 'description': 'Text-only mix for testing'
247
- },
248
- }
249
-
250
- POSTTRAIN_MIX = {
251
- 'default': {
252
- 'datasets': ['flan_v2', 'alpaca', 'dolly', 'oasst1'],
253
- 'weights': [0.4, 0.3, 0.2, 0.1],
254
- 'description': 'Default instruction tuning mix'
255
- },
256
- 'conversation': {
257
- 'datasets': ['oasst1', 'sharegpt', 'alpaca'],
258
- 'weights': [0.4, 0.4, 0.2],
259
- 'description': 'Conversation-focused mix'
260
- },
261
- 'code_instruct': {
262
- 'datasets': ['code_alpaca', 'alpaca', 'flan_v2'],
263
- 'weights': [0.5, 0.3, 0.2],
264
- 'description': 'Code instruction mix'
265
- },
266
- 'simple_instruct': {
267
- 'datasets': ['alpaca', 'dolly'],
268
- 'weights': [0.6, 0.4],
269
- 'description': 'Simple instruction mix for testing'
270
- },
271
- 'debug_mix': {
272
- 'datasets': ['debug_water'],
273
- 'weights': [1.0],
274
- 'description': 'Debug mix for overfitting'
275
- },
276
- }
277
-
278
- # 下载和缓存配置
279
- DATASET_CACHE_DIR = "./dataset_cache"
280
- HF_CACHE_DIR = "./hf_cache"
281
- MAX_RETRIES = 3
282
- DOWNLOAD_TIMEOUT = 300
283
-
284
- # 数据处理配置
285
- PREPROCESSING_CONFIG = {
286
- 'max_seq_length': 2048,
287
- 'min_seq_length': 32,
288
- 'num_workers': 4,
289
- 'batch_size': 8,
290
- 'shuffle_buffer_size': 10000,
291
- 'seed': 42,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  }
 
1
+ PRETRAIN_DATASETS = {
2
+ 'skypile_local': {
3
+ 'type': 'text',
4
+ 'hf_path': 'json',
5
+ 'data_files': [
6
+ '/root/dataset/2020-40_zh_head_0000.jsonl',
7
+ '/root/dataset/2020-40_zh_head_0001.jsonl',
8
+ '/root/dataset/2020-40_zh_head_0002.jsonl',
9
+ '/root/dataset/2020-40_zh_head_0003.jsonl'
10
+ ],
11
+ 'split': 'train',
12
+ 'streaming': False,
13
+ 'text_field': 'text',
14
+ 'weight': 1.0,
15
+ 'description': 'SkyPile-150B subset (local)'
16
+ },
17
+ 'the_pile': {
18
+ 'type': 'text',
19
+ 'hf_path': 'EleutherAI/pile',
20
+ 'split': 'train',
21
+ 'streaming': True,
22
+ 'text_field': 'text',
23
+ 'weight': 1.0,
24
+ 'description': 'The Pile - 825GB diverse text corpus'
25
+ },
26
+ 'pretrain_hq': {
27
+ 'type': 'text',
28
+ 'hf_path': 'json',
29
+ 'data_files': '/root/dataset/pretrain_hq.jsonl',
30
+ 'split': 'train',
31
+ 'streaming': False,
32
+ 'text_field': 'text',
33
+ 'weight': 1.0,
34
+ 'description': 'Custom high-quality pretrain dataset from local JSONL'
35
+ },
36
+ 'c4': {
37
+ 'type': 'text',
38
+ 'hf_path': 'allenai/c4',
39
+ 'config': 'en',
40
+ 'split': 'train',
41
+ 'streaming': True,
42
+ 'text_field': 'text',
43
+ 'weight': 0.5,
44
+ 'description': 'C4 - Colossal Clean Crawled Corpus'
45
+ },
46
+ 'wikipedia': {
47
+ 'type': 'text',
48
+ 'hf_path': 'HuggingFaceFW/fineweb-edu',
49
+ 'config': 'sample-10BT',
50
+ 'split': 'train',
51
+ 'streaming': True,
52
+ 'text_field': 'text',
53
+ 'weight': 0.3,
54
+ 'description': 'FineWeb Edu - High quality educational content'
55
+ },
56
+ 'bookcorpus': {
57
+ 'type': 'text',
58
+ 'hf_path': 'HuggingFaceTB/smollm-corpus',
59
+ 'config': 'cosmopedia-v2',
60
+ 'split': 'train',
61
+ 'streaming': True,
62
+ 'text_field': 'text',
63
+ 'weight': 0.2,
64
+ 'description': 'Synthetic textbooks and stories'
65
+ },
66
+ # 代码数据集
67
+ 'codeparrot': {
68
+ 'type': 'code',
69
+ 'hf_path': 'bigcode/the-stack-smol',
70
+ 'config': 'default',
71
+ 'split': 'train',
72
+ 'streaming': True,
73
+ 'text_field': 'content',
74
+ 'weight': 0.3,
75
+ 'description': 'The Stack Smol - code'
76
+ },
77
+ 'the_stack': {
78
+ 'type': 'code',
79
+ 'hf_path': 'bigcode/the-stack-dedup',
80
+ 'split': 'train',
81
+ 'streaming': True,
82
+ 'text_field': 'content',
83
+ 'weight': 0.2,
84
+ 'description': 'The Stack - deduplicated code'
85
+ },
86
+ # 多模态数据集
87
+ 'laion400m': {
88
+ 'type': 'image_text',
89
+ 'hf_path': 'laion/laion400m',
90
+ 'split': 'train',
91
+ 'streaming': True,
92
+ 'image_field': 'url',
93
+ 'text_field': 'caption',
94
+ 'weight': 0.4,
95
+ 'description': 'LAION-400M image-text pairs'
96
+ },
97
+ 'conceptual_captions': {
98
+ 'type': 'image_text',
99
+ 'hf_path': 'google-research-datasets/conceptual_captions',
100
+ 'split': 'train',
101
+ 'streaming': False,
102
+ 'image_field': 'image_url',
103
+ 'text_field': 'caption',
104
+ 'weight': 0.2,
105
+ 'description': 'Conceptual Captions 3M'
106
+ },
107
+ }
108
+ POSTTRAIN_DATASETS = {
109
+ 'r1_mix_dataset': {
110
+ 'type': 'conversation',
111
+ 'hf_path': 'json',
112
+ 'data_files': '/root/dataset/r1_mix_1024.jsonl',
113
+ 'split': 'train',
114
+ 'streaming': False,
115
+ 'weight': 1.0,
116
+ 'description': 'DeepSeek R1 Distill Mix (User/Assistant with <think>)'
117
+ },
118
+ 'minimind_sft': {
119
+ 'type': 'conversation',
120
+ 'hf_path': 'json',
121
+ 'data_files': './dataset/sft_mini.jsonl',
122
+ 'split': 'train',
123
+ 'streaming': False,
124
+ 'weight': 1.0,
125
+ 'max_samples': 100,
126
+ 'description': 'MiniMind Multi-turn SFT dataset'
127
+ },
128
+ 'self_en': {
129
+ 'type': 'conversation',
130
+ 'hf_path': 'json',
131
+ 'data_files': '/root/dataset/sft_en.jsonl',
132
+ 'split': 'train',
133
+ 'streaming': False,
134
+ 'weight': 1.0,
135
+ 'description': ' SFT_en dataset'
136
+ },
137
+ 'flan_v2': {
138
+ 'type': 'instruction',
139
+ 'hf_path': 'Muennighoff/flan',
140
+ 'split': 'train',
141
+ 'streaming': True,
142
+ 'instruction_field': 'inputs',
143
+ 'response_field': 'targets',
144
+ 'weight': 1.0,
145
+ 'max_samples': 100000,
146
+ 'description': 'FLAN v2 collection'
147
+ },
148
+ 'alpaca': {
149
+ 'type': 'instruction',
150
+ 'hf_path': 'tatsu-lab/alpaca',
151
+ 'split': 'train',
152
+ 'streaming': False,
153
+ 'instruction_field': 'instruction',
154
+ 'input_field': 'input',
155
+ 'response_field': 'output',
156
+ 'weight': 0.5,
157
+ 'description': 'Stanford Alpaca 52K'
158
+ },
159
+ 'dolly': {
160
+ 'type': 'instruction',
161
+ 'hf_path': 'databricks/databricks-dolly-15k',
162
+ 'split': 'train',
163
+ 'streaming': False,
164
+ 'instruction_field': 'instruction',
165
+ 'context_field': 'context',
166
+ 'response_field': 'response',
167
+ 'weight': 0.3,
168
+ 'description': 'Dolly 15K'
169
+ },
170
+ 'oasst1': {
171
+ 'type': 'conversation',
172
+ 'hf_path': 'OpenAssistant/oasst1',
173
+ 'split': 'train',
174
+ 'streaming': False,
175
+ 'weight': 0.4,
176
+ 'description': 'OpenAssistant Conversations'
177
+ },
178
+ 'sharegpt': {
179
+ 'type': 'conversation',
180
+ 'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
181
+ 'split': 'train',
182
+ 'streaming': False,
183
+ 'weight': 0.3,
184
+ 'max_samples': 50000,
185
+ 'description': 'ShareGPT conversations'
186
+ },
187
+ 'code_alpaca': {
188
+ 'type': 'code_instruction',
189
+ 'hf_path': 'sahil2801/CodeAlpaca-20k',
190
+ 'split': 'train',
191
+ 'streaming': False,
192
+ 'instruction_field': 'instruction',
193
+ 'response_field': 'output',
194
+ 'weight': 0.3,
195
+ 'description': 'Code Alpaca 20K'
196
+ },
197
+ 'llava_instruct': {
198
+ 'type': 'multimodal_instruction',
199
+ 'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
200
+ 'split': 'train',
201
+ 'streaming': False,
202
+ 'image_field': 'image',
203
+ 'instruction_field': 'conversations',
204
+ 'weight': 0.5,
205
+ 'description': 'LLaVA visual instruction tuning'
206
+ },
207
+ 'hh_rlhf': {
208
+ 'type': 'preference',
209
+ 'hf_path': 'Anthropic/hh-rlhf',
210
+ 'split': 'train',
211
+ 'streaming': False,
212
+ 'chosen_field': 'chosen',
213
+ 'rejected_field': 'rejected',
214
+ 'weight': 1.0,
215
+ 'description': 'Anthropic HH-RLHF'
216
+ },
217
+ 'ultrafeedback': {
218
+ 'type': 'preference',
219
+ 'hf_path': 'openbmb/UltraFeedback',
220
+ 'split': 'train',
221
+ 'streaming': True,
222
+ 'chosen_field': 'chosen',
223
+ 'rejected_field': 'rejected',
224
+ 'weight': 0.5,
225
+ 'max_samples': 50000,
226
+ 'description': 'UltraFeedback preferences'
227
+ },
228
+ 'debug_water': {
229
+ 'type': 'instruction',
230
+ 'hf_path': 'json',
231
+ 'data_files': 'debug_water.json',
232
+ 'split': 'train',
233
+ 'streaming': False,
234
+ 'instruction_field': 'instruction',
235
+ 'response_field': 'output',
236
+ 'weight': 1.0,
237
+ 'description': 'Overfitting test for water'
238
+ },
239
+ 'grpo_preferences_local': {
240
+ 'type': 'preference',
241
+ 'hf_path': 'json',
242
+ 'data_files': '/root/dataset/grpo_preferences.jsonl',
243
+ 'split': 'train',
244
+ 'streaming': False,
245
+ 'chosen_field': 'chosen',
246
+ 'rejected_field': 'rejected',
247
+ 'weight': 1.0,
248
+ 'description': 'Local GRPO preference pairs'
249
+ },
250
+ 'gsm8k_zh': {
251
+ 'type': 'instruction',
252
+ 'hf_path': 'json',
253
+ 'data_files': '/root/dataset/gsm8k_zh_train.jsonl',
254
+ 'split': 'train',
255
+ 'streaming': False,
256
+ 'instruction_field': 'question_zh',
257
+ 'response_field': 'answer_zh',
258
+ 'weight': 1.0,
259
+ 'description': 'GSM8K Chinese math reasoning dataset'
260
+ },
261
+ }
262
+
263
+ GRPO_DATASETS = {
264
+ 'grpo_prompts_hh': {
265
+ 'type': 'prompt',
266
+ 'hf_path': 'json',
267
+ 'data_files': '/root/dataset/grpo_prompts_hh.jsonl',
268
+ 'split': 'train',
269
+ 'streaming': False,
270
+ 'prompt_field': 'prompt',
271
+ 'weight': 1.0,
272
+ 'description': 'HH-RLHF prompts for GRPO generation'
273
+ },
274
+ 'grpo_prompts_alpaca': {
275
+ 'type': 'prompt',
276
+ 'hf_path': 'json',
277
+ 'data_files': '/root/dataset/grpo_prompts_alpaca.jsonl',
278
+ 'split': 'train',
279
+ 'streaming': False,
280
+ 'prompt_field': 'prompt',
281
+ 'weight': 0.5,
282
+ 'description': 'Alpaca-style prompts for GRPO'
283
+ },
284
+ 'grpo_prompts_simple': {
285
+ 'type': 'prompt',
286
+ 'hf_path': 'json',
287
+ 'data_files': '/root/dataset/grpo_prompts_simple.jsonl',
288
+ 'split': 'train',
289
+ 'streaming': False,
290
+ 'prompt_field': 'prompt',
291
+ 'weight': 0.1,
292
+ 'description': 'Simple test prompts'
293
+ },
294
+ }
295
+
296
+ GRPO_PROMPT_MIX = {
297
+ 'default': {
298
+ 'datasets': ['grpo_prompts_hh'],
299
+ 'weights': [1.0],
300
+ 'description': 'Default GRPO prompt mix'
301
+ },
302
+ 'hh_only': {
303
+ 'datasets': ['grpo_prompts_hh'],
304
+ 'weights': [1.0],
305
+ 'description': 'HH-RLHF prompts only'
306
+ },
307
+ 'alpaca_only': {
308
+ 'datasets': ['grpo_prompts_alpaca'],
309
+ 'weights': [1.0],
310
+ 'description': 'Alpaca prompts only'
311
+ },
312
+ 'test': {
313
+ 'datasets': ['grpo_prompts_simple'],
314
+ 'weights': [1.0],
315
+ 'description': 'Simple test prompts'
316
+ },
317
+ }
318
+
319
+ TEST_DATASETS = {
320
+ 'tiny_shakespeare': {
321
+ 'type': 'text',
322
+ 'hf_path': 'tiny_shakespeare',
323
+ 'split': 'train',
324
+ 'streaming': False,
325
+ 'text_field': 'text',
326
+ 'weight': 1.0,
327
+ 'description': 'Tiny Shakespeare for testing'
328
+ },
329
+ 'gsm8k': {
330
+ 'type': 'instruction',
331
+ 'hf_path': 'gsm8k',
332
+ 'config': 'main',
333
+ 'split': 'train',
334
+ 'streaming': False,
335
+ 'instruction_field': 'question',
336
+ 'response_field': 'answer',
337
+ 'weight': 1.0,
338
+ 'description': 'GSM8K math problems'
339
+ },
340
+ }
341
+
342
+ PRETRAIN_MIX = {
343
+ 'default': {
344
+ 'datasets': ['c4', 'wikipedia', 'bookcorpus', 'codeparrot'],
345
+ 'weights': [0.5, 0.2, 0.2, 0.1],
346
+ 'description': 'Default pretrain mix'
347
+ },
348
+ 'code_heavy': {
349
+ 'datasets': ['c4', 'codeparrot', 'the_stack', 'wikipedia'],
350
+ 'weights': [0.3, 0.4, 0.2, 0.1],
351
+ 'description': 'Code-heavy mix'
352
+ },
353
+ 'multimodal': {
354
+ 'datasets': ['c4', 'wikipedia', 'laion400m', 'conceptual_captions'],
355
+ 'weights': [0.4, 0.2, 0.3, 0.1],
356
+ 'description': 'Multimodal mix'
357
+ },
358
+ 'text_only': {
359
+ 'datasets': ['c4', 'wikipedia', 'bookcorpus'],
360
+ 'weights': [0.5, 0.3, 0.2],
361
+ 'description': 'Text-only mix for testing'
362
+ },
363
+ 'custom_hq': {
364
+ 'datasets': ['pretrain_hq'],
365
+ 'weights': [1.0],
366
+ 'description': 'Custom mix using local pretrain_hq.jsonl'
367
+ },
368
+ 'skypile_training': {
369
+ 'datasets': ['skypile_local'],
370
+ 'weights': [1.0],
371
+ 'description': 'Pure pre-training on SkyPile data'
372
+ },
373
+ }
374
+
375
+ POSTTRAIN_MIX = {
376
+ 'default': {
377
+ 'datasets': ['flan_v2', 'alpaca', 'dolly', 'oasst1'],
378
+ 'weights': [0.4, 0.3, 0.2, 0.1],
379
+ 'description': 'Default instruction tuning mix'
380
+ },
381
+ 'conversation': {
382
+ 'datasets': ['oasst1', 'sharegpt', 'alpaca'],
383
+ 'weights': [0.4, 0.4, 0.2],
384
+ 'description': 'Conversation-focused mix'
385
+ },
386
+ 'code_instruct': {
387
+ 'datasets': ['code_alpaca', 'alpaca', 'flan_v2'],
388
+ 'weights': [0.5, 0.3, 0.2],
389
+ 'description': 'Code instruction mix'
390
+ },
391
+ 'simple_instruct': {
392
+ 'datasets': ['alpaca', 'dolly'],
393
+ 'weights': [0.6, 0.4],
394
+ 'description': 'Simple instruction mix for testing'
395
+ },
396
+ 'minimind_mix': {
397
+ 'datasets': ['minimind_sft', 'self_en'],
398
+ 'weights': [0.01, 0.99],
399
+ 'description': 'Fine-tuning on MiniMind dataset'
400
+ },
401
+ 'r1_mix_strategy': {
402
+ 'datasets': ['r1_mix_dataset'],
403
+ 'weights': [1.0],
404
+ 'description': 'Fine-tuning on R1 Distill dataset'
405
+ },
406
+ 'gsm8k_zh_mix': {
407
+ 'datasets': ['gsm8k_zh'],
408
+ 'weights': [1.0],
409
+ 'description': 'Fine-tuning on GSM8K Chinese math reasoning dataset'
410
+ },
411
+ 'think_math_mix': {
412
+ 'datasets': ['r1_mix_dataset', 'gsm8k_zh'],
413
+ 'weights': [0.7, 0.3],
414
+ 'description': 'Mix of R1 Distill and GSM8K Chinese for math reasoning'
415
+ },
416
+ }
417
+
418
+ DATASET_CACHE_DIR = "./dataset_cache"
419
+ HF_CACHE_DIR = "./hf_cache"
420
+ MAX_RETRIES = 3
421
+ DOWNLOAD_TIMEOUT = 300
422
+
423
+ PREPROCESSING_CONFIG = {
424
+ 'max_seq_length': 2048,
425
+ 'min_seq_length': 32,
426
+ 'num_workers': 4,
427
+ 'batch_size': 8,
428
+ 'shuffle_buffer_size': 10000,
429
+ 'seed': 42,
430
  }