szxllm
/

MultiModal

Model card Files and versions

xet

Community

szxllm committed on Feb 25

Commit

6d0972d

verified ·

1 Parent(s): 71c3881

Update data_config.py

Browse files

Files changed (1) hide show

data_config.py +429 -291

data_config.py CHANGED Viewed

@@ -1,292 +1,430 @@
-# data_config.py
-"""
-预训练和后训练数据集配置
-"""
-PRETRAIN_DATASETS = {
-    # 文本数据集
-    'the_pile': {
-        'type': 'text',
-        'hf_path': 'EleutherAI/pile',
-        'split': 'train',
-        'streaming': True,
-        'text_field': 'text',
-        'weight': 1.0,
-        'description': 'The Pile - 825GB diverse text corpus'
-    },
-    'c4': {
-        'type': 'text',
-        'hf_path': 'allenai/c4',
-        'config': 'en',
-        'split': 'train',
-        'streaming': True,
-        'text_field': 'text',
-        'weight': 0.5,
-        'description': 'C4 - Colossal Clean Crawled Corpus'
-    },
-    'wikipedia': {
-        'type': 'text',
-        'hf_path': 'HuggingFaceFW/fineweb-edu',
-        'config': 'sample-10BT',
-        'split': 'train',
-        'streaming': True,
-        'text_field': 'text',
-        'weight': 0.3,
-        'description': 'FineWeb Edu - High quality educational content'
-    },
-    'bookcorpus': {
-        'type': 'text',
-        'hf_path': 'HuggingFaceTB/smollm-corpus',
-        'config': 'cosmopedia-v2',
-        'split': 'train',
-        'streaming': True,
-        'text_field': 'text',
-        'weight': 0.2,
-        'description': 'Synthetic textbooks and stories'
-    },
-    # 代码数据集
-    'codeparrot': {
-        'type': 'code',
-        'hf_path': 'bigcode/the-stack-smol',
-        'config': 'default',
-        'split': 'train',
-        'streaming': True,
-        'text_field': 'content',
-        'weight': 0.3,
-        'description': 'The Stack Smol - code'
-    },
-    'the_stack': {
-        'type': 'code',
-        'hf_path': 'bigcode/the-stack-dedup',
-        'split': 'train',
-        'streaming': True,
-        'text_field': 'content',
-        'weight': 0.2,
-        'description': 'The Stack - deduplicated code'
-    },
-    # 多模态数据集
-    'laion400m': {
-        'type': 'image_text',
-        'hf_path': 'laion/laion400m',
-        'split': 'train',
-        'streaming': True,
-        'image_field': 'url',
-        'text_field': 'caption',
-        'weight': 0.4,
-        'description': 'LAION-400M image-text pairs'
-    },
-    'conceptual_captions': {
-        'type': 'image_text',
-        'hf_path': 'google-research-datasets/conceptual_captions',
-        'split': 'train',
-        'streaming': False,
-        'image_field': 'image_url',
-        'text_field': 'caption',
-        'weight': 0.2,
-        'description': 'Conceptual Captions 3M'
-    },
-}
-# 后训练数据集配置（instruction tuning + alignment）
-POSTTRAIN_DATASETS = {
-    # Instruction Tuning数据集
-    'flan_v2': {
-        'type': 'instruction',
-        'hf_path': 'Muennighoff/flan',
-        'split': 'train',
-        'streaming': True,
-        'instruction_field': 'inputs',
-        'response_field': 'targets',
-        'weight': 1.0,
-        'max_samples': 100000,
-        'description': 'FLAN v2 collection'
-    },
-    'alpaca': {
-        'type': 'instruction',
-        'hf_path': 'tatsu-lab/alpaca',
-        'split': 'train',
-        'streaming': False,
-        'instruction_field': 'instruction',
-        'input_field': 'input',
-        'response_field': 'output',
-        'weight': 0.5,
-        'description': 'Stanford Alpaca 52K'
-    },
-    'dolly': {
-        'type': 'instruction',
-        'hf_path': 'databricks/databricks-dolly-15k',
-        'split': 'train',
-        'streaming': False,
-        'instruction_field': 'instruction',
-        'context_field': 'context',  # Dolly有context字段
-        'response_field': 'response',
-        'weight': 0.3,
-        'description': 'Dolly 15K'
-    },
-    'oasst1': {
-        'type': 'conversation',
-        'hf_path': 'OpenAssistant/oasst1',
-        'split': 'train',
-        'streaming': False,
-        'weight': 0.4,
-        'description': 'OpenAssistant Conversations',
-        # OASST1需要特殊处理，因为它是树形结构
-        # 可能需要自定义预处理
-    },
-    'sharegpt': {
-        'type': 'conversation',
-        'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
-        'split': 'train',
-        'streaming': False,
-        'weight': 0.3,
-        'max_samples': 50000,
-        'description': 'ShareGPT conversations'
-    },
-    # Code instruction数据集
-    'code_alpaca': {
-        'type': 'code_instruction',
-        'hf_path': 'sahil2801/CodeAlpaca-20k',
-        'split': 'train',
-        'streaming': False,
-        'instruction_field': 'instruction',
-        'response_field': 'output',
-        'weight': 0.3,
-        'description': 'Code Alpaca 20K'
-    },
-    # 多模态instruction数据集
-    'llava_instruct': {
-        'type': 'multimodal_instruction',
-        'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
-        'split': 'train',
-        'streaming': False,
-        'image_field': 'image',
-        'instruction_field': 'conversations',
-        'weight': 0.5,
-        'description': 'LLaVA visual instruction tuning'
-    },
-    # Preference数据集 (用于RLHF)
-    'hh_rlhf': {
-        'type': 'preference',
-        'hf_path': 'Anthropic/hh-rlhf',
-        'split': 'train',
-        'streaming': False,
-        'chosen_field': 'chosen',
-        'rejected_field': 'rejected',
-        'weight': 1.0,
-        'description': 'Anthropic HH-RLHF'
-    },
-    'ultrafeedback': {
-        'type': 'preference',
-        'hf_path': 'openbmb/UltraFeedback',
-        'split': 'train',
-        'streaming': True,
-        'chosen_field': 'chosen',  # 添加字段配置
-        'rejected_field': 'rejected',
-        'weight': 0.5,
-        'max_samples': 50000,
-        'description': 'UltraFeedback preferences'
-    },
-     'debug_water': {
-        'type': 'instruction',
-        'hf_path': 'json',              # 使用 json 加载器
-        'data_files': 'debug_water.json', # 指向刚才生成的文件
-        'split': 'train',
-        'streaming': False,
-        'instruction_field': 'instruction',
-        'response_field': 'output',
-        'weight': 1.0,
-        'description': 'Overfitting test for water'
-    },
-}
-# 轻量级测试数据集（用于快速验证）
-TEST_DATASETS = {
-    'tiny_shakespeare': {
-        'type': 'text',
-        'hf_path': 'tiny_shakespeare',
-        'split': 'train',
-        'streaming': False,
-        'text_field': 'text',
-        'weight': 1.0,
-        'description': 'Tiny Shakespeare for testing'
-    },
-    'gsm8k': {
-        'type': 'instruction',
-        'hf_path': 'gsm8k',
-        'config': 'main',
-        'split': 'train',
-        'streaming': False,
-        'instruction_field': 'question',
-        'response_field': 'answer',
-        'weight': 1.0,
-        'description': 'GSM8K math problems'
-    },
-}
-# 数据集混合策略
-PRETRAIN_MIX = {
-    'default': {
-        'datasets': ['c4', 'wikipedia', 'bookcorpus', 'codeparrot'],
-        'weights': [0.5, 0.2, 0.2, 0.1],
-        'description': 'Default pretrain mix'
-    },
-    'code_heavy': {
-        'datasets': ['c4', 'codeparrot', 'the_stack', 'wikipedia'],
-        'weights': [0.3, 0.4, 0.2, 0.1],
-        'description': 'Code-heavy mix'
-    },
-    'multimodal': {
-        'datasets': ['c4', 'wikipedia', 'laion400m', 'conceptual_captions'],
-        'weights': [0.4, 0.2, 0.3, 0.1],
-        'description': 'Multimodal mix'
-    },
-    'text_only': {
-        'datasets': ['c4', 'wikipedia', 'bookcorpus'],
-        'weights': [0.5, 0.3, 0.2],
-        'description': 'Text-only mix for testing'
-    },
-}
-POSTTRAIN_MIX = {
-    'default': {
-        'datasets': ['flan_v2', 'alpaca', 'dolly', 'oasst1'],
-        'weights': [0.4, 0.3, 0.2, 0.1],
-        'description': 'Default instruction tuning mix'
-    },
-    'conversation': {
-        'datasets': ['oasst1', 'sharegpt', 'alpaca'],
-        'weights': [0.4, 0.4, 0.2],
-        'description': 'Conversation-focused mix'
-    },
-    'code_instruct': {
-        'datasets': ['code_alpaca', 'alpaca', 'flan_v2'],
-        'weights': [0.5, 0.3, 0.2],
-        'description': 'Code instruction mix'
-    },
-    'simple_instruct': {
-        'datasets': ['alpaca', 'dolly'],
-        'weights': [0.6, 0.4],
-        'description': 'Simple instruction mix for testing'
-    },
-    'debug_mix': {
-        'datasets': ['debug_water'],
-        'weights': [1.0],
-        'description': 'Debug mix for overfitting'
-    },
-}
-# 下载和缓存配置
-DATASET_CACHE_DIR = "./dataset_cache"
-HF_CACHE_DIR = "./hf_cache"
-MAX_RETRIES = 3
-DOWNLOAD_TIMEOUT = 300
-# 数据处理配置
-PREPROCESSING_CONFIG = {
-    'max_seq_length': 2048,
-    'min_seq_length': 32,
-    'num_workers': 4,
-    'batch_size': 8,
-    'shuffle_buffer_size': 10000,
-    'seed': 42,
 }

+PRETRAIN_DATASETS = {
+    'skypile_local': {
+        'type': 'text',
+        'hf_path': 'json',
+        'data_files': [
+            '/root/dataset/2020-40_zh_head_0000.jsonl',
+            '/root/dataset/2020-40_zh_head_0001.jsonl',
+            '/root/dataset/2020-40_zh_head_0002.jsonl',
+            '/root/dataset/2020-40_zh_head_0003.jsonl'
+        ],
+        'split': 'train',
+        'streaming': False,
+        'text_field': 'text',
+        'weight': 1.0,
+        'description': 'SkyPile-150B subset (local)'
+    },
+    'the_pile': {
+        'type': 'text',
+        'hf_path': 'EleutherAI/pile',
+        'split': 'train',
+        'streaming': True,
+        'text_field': 'text',
+        'weight': 1.0,
+        'description': 'The Pile - 825GB diverse text corpus'
+    },
+    'pretrain_hq': {
+        'type': 'text',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/pretrain_hq.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'text_field': 'text',
+        'weight': 1.0,
+        'description': 'Custom high-quality pretrain dataset from local JSONL'
+    },
+    'c4': {
+        'type': 'text',
+        'hf_path': 'allenai/c4',
+        'config': 'en',
+        'split': 'train',
+        'streaming': True,
+        'text_field': 'text',
+        'weight': 0.5,
+        'description': 'C4 - Colossal Clean Crawled Corpus'
+    },
+    'wikipedia': {
+        'type': 'text',
+        'hf_path': 'HuggingFaceFW/fineweb-edu',
+        'config': 'sample-10BT',
+        'split': 'train',
+        'streaming': True,
+        'text_field': 'text',
+        'weight': 0.3,
+        'description': 'FineWeb Edu - High quality educational content'
+    },
+    'bookcorpus': {
+        'type': 'text',
+        'hf_path': 'HuggingFaceTB/smollm-corpus',
+        'config': 'cosmopedia-v2',
+        'split': 'train',
+        'streaming': True,
+        'text_field': 'text',
+        'weight': 0.2,
+        'description': 'Synthetic textbooks and stories'
+    },
+    # 代码数据集
+    'codeparrot': {
+        'type': 'code',
+        'hf_path': 'bigcode/the-stack-smol',
+        'config': 'default',
+        'split': 'train',
+        'streaming': True,
+        'text_field': 'content',
+        'weight': 0.3,
+        'description': 'The Stack Smol - code'
+    },
+    'the_stack': {
+        'type': 'code',
+        'hf_path': 'bigcode/the-stack-dedup',
+        'split': 'train',
+        'streaming': True,
+        'text_field': 'content',
+        'weight': 0.2,
+        'description': 'The Stack - deduplicated code'
+    },
+    # 多模态数据集
+    'laion400m': {
+        'type': 'image_text',
+        'hf_path': 'laion/laion400m',
+        'split': 'train',
+        'streaming': True,
+        'image_field': 'url',
+        'text_field': 'caption',
+        'weight': 0.4,
+        'description': 'LAION-400M image-text pairs'
+    },
+    'conceptual_captions': {
+        'type': 'image_text',
+        'hf_path': 'google-research-datasets/conceptual_captions',
+        'split': 'train',
+        'streaming': False,
+        'image_field': 'image_url',
+        'text_field': 'caption',
+        'weight': 0.2,
+        'description': 'Conceptual Captions 3M'
+    },
+}
+POSTTRAIN_DATASETS = {
+    'r1_mix_dataset': {
+        'type': 'conversation',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/r1_mix_1024.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'weight': 1.0,
+        'description': 'DeepSeek R1 Distill Mix (User/Assistant with <think>)'
+    },
+    'minimind_sft': {
+        'type': 'conversation',
+        'hf_path': 'json',
+        'data_files': './dataset/sft_mini.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'weight': 1.0,
+        'max_samples': 100,
+        'description': 'MiniMind Multi-turn SFT dataset'
+    },
+    'self_en': {
+        'type': 'conversation',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/sft_en.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'weight': 1.0,
+        'description': ' SFT_en dataset'
+    },
+    'flan_v2': {
+        'type': 'instruction',
+        'hf_path': 'Muennighoff/flan',
+        'split': 'train',
+        'streaming': True,
+        'instruction_field': 'inputs',
+        'response_field': 'targets',
+        'weight': 1.0,
+        'max_samples': 100000,
+        'description': 'FLAN v2 collection'
+    },
+    'alpaca': {
+        'type': 'instruction',
+        'hf_path': 'tatsu-lab/alpaca',
+        'split': 'train',
+        'streaming': False,
+        'instruction_field': 'instruction',
+        'input_field': 'input',
+        'response_field': 'output',
+        'weight': 0.5,
+        'description': 'Stanford Alpaca 52K'
+    },
+    'dolly': {
+        'type': 'instruction',
+        'hf_path': 'databricks/databricks-dolly-15k',
+        'split': 'train',
+        'streaming': False,
+        'instruction_field': 'instruction',
+        'context_field': 'context',
+        'response_field': 'response',
+        'weight': 0.3,
+        'description': 'Dolly 15K'
+    },
+    'oasst1': {
+        'type': 'conversation',
+        'hf_path': 'OpenAssistant/oasst1',
+        'split': 'train',
+        'streaming': False,
+        'weight': 0.4,
+        'description': 'OpenAssistant Conversations'
+    },
+    'sharegpt': {
+        'type': 'conversation',
+        'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
+        'split': 'train',
+        'streaming': False,
+        'weight': 0.3,
+        'max_samples': 50000,
+        'description': 'ShareGPT conversations'
+    },
+    'code_alpaca': {
+        'type': 'code_instruction',
+        'hf_path': 'sahil2801/CodeAlpaca-20k',
+        'split': 'train',
+        'streaming': False,
+        'instruction_field': 'instruction',
+        'response_field': 'output',
+        'weight': 0.3,
+        'description': 'Code Alpaca 20K'
+    },
+    'llava_instruct': {
+        'type': 'multimodal_instruction',
+        'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
+        'split': 'train',
+        'streaming': False,
+        'image_field': 'image',
+        'instruction_field': 'conversations',
+        'weight': 0.5,
+        'description': 'LLaVA visual instruction tuning'
+    },
+    'hh_rlhf': {
+        'type': 'preference',
+        'hf_path': 'Anthropic/hh-rlhf',
+        'split': 'train',
+        'streaming': False,
+        'chosen_field': 'chosen',
+        'rejected_field': 'rejected',
+        'weight': 1.0,
+        'description': 'Anthropic HH-RLHF'
+    },
+    'ultrafeedback': {
+        'type': 'preference',
+        'hf_path': 'openbmb/UltraFeedback',
+        'split': 'train',
+        'streaming': True,
+        'chosen_field': 'chosen',
+        'rejected_field': 'rejected',
+        'weight': 0.5,
+        'max_samples': 50000,
+        'description': 'UltraFeedback preferences'
+    },
+    'debug_water': {
+        'type': 'instruction',
+        'hf_path': 'json',
+        'data_files': 'debug_water.json',
+        'split': 'train',
+        'streaming': False,
+        'instruction_field': 'instruction',
+        'response_field': 'output',
+        'weight': 1.0,
+        'description': 'Overfitting test for water'
+    },
+    'grpo_preferences_local': {
+        'type': 'preference',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/grpo_preferences.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'chosen_field': 'chosen',
+        'rejected_field': 'rejected',
+        'weight': 1.0,
+        'description': 'Local GRPO preference pairs'
+    },
+    'gsm8k_zh': {
+        'type': 'instruction',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/gsm8k_zh_train.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'instruction_field': 'question_zh',
+        'response_field': 'answer_zh',
+        'weight': 1.0,
+        'description': 'GSM8K Chinese math reasoning dataset'
+    },
+}
+GRPO_DATASETS = {
+    'grpo_prompts_hh': {
+        'type': 'prompt',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/grpo_prompts_hh.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'prompt_field': 'prompt',
+        'weight': 1.0,
+        'description': 'HH-RLHF prompts for GRPO generation'
+    },
+    'grpo_prompts_alpaca': {
+        'type': 'prompt',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/grpo_prompts_alpaca.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'prompt_field': 'prompt',
+        'weight': 0.5,
+        'description': 'Alpaca-style prompts for GRPO'
+    },
+    'grpo_prompts_simple': {
+        'type': 'prompt',
+        'hf_path': 'json',
+        'data_files': '/root/dataset/grpo_prompts_simple.jsonl',
+        'split': 'train',
+        'streaming': False,
+        'prompt_field': 'prompt',
+        'weight': 0.1,
+        'description': 'Simple test prompts'
+    },
+}
+GRPO_PROMPT_MIX = {
+    'default': {
+        'datasets': ['grpo_prompts_hh'],
+        'weights': [1.0],
+        'description': 'Default GRPO prompt mix'
+    },
+    'hh_only': {
+        'datasets': ['grpo_prompts_hh'],
+        'weights': [1.0],
+        'description': 'HH-RLHF prompts only'
+    },
+    'alpaca_only': {
+        'datasets': ['grpo_prompts_alpaca'],
+        'weights': [1.0],
+        'description': 'Alpaca prompts only'
+    },
+    'test': {
+        'datasets': ['grpo_prompts_simple'],
+        'weights': [1.0],
+        'description': 'Simple test prompts'
+    },
+}
+TEST_DATASETS = {
+    'tiny_shakespeare': {
+        'type': 'text',
+        'hf_path': 'tiny_shakespeare',
+        'split': 'train',
+        'streaming': False,
+        'text_field': 'text',
+        'weight': 1.0,
+        'description': 'Tiny Shakespeare for testing'
+    },
+    'gsm8k': {
+        'type': 'instruction',
+        'hf_path': 'gsm8k',
+        'config': 'main',
+        'split': 'train',
+        'streaming': False,
+        'instruction_field': 'question',
+        'response_field': 'answer',
+        'weight': 1.0,
+        'description': 'GSM8K math problems'
+    },
+}
+PRETRAIN_MIX = {
+    'default': {
+        'datasets': ['c4', 'wikipedia', 'bookcorpus', 'codeparrot'],
+        'weights': [0.5, 0.2, 0.2, 0.1],
+        'description': 'Default pretrain mix'
+    },
+    'code_heavy': {
+        'datasets': ['c4', 'codeparrot', 'the_stack', 'wikipedia'],
+        'weights': [0.3, 0.4, 0.2, 0.1],
+        'description': 'Code-heavy mix'
+    },
+    'multimodal': {
+        'datasets': ['c4', 'wikipedia', 'laion400m', 'conceptual_captions'],
+        'weights': [0.4, 0.2, 0.3, 0.1],
+        'description': 'Multimodal mix'
+    },
+    'text_only': {
+        'datasets': ['c4', 'wikipedia', 'bookcorpus'],
+        'weights': [0.5, 0.3, 0.2],
+        'description': 'Text-only mix for testing'
+    },
+    'custom_hq': {
+        'datasets': ['pretrain_hq'],
+        'weights': [1.0],
+        'description': 'Custom mix using local pretrain_hq.jsonl'
+    },
+    'skypile_training': {
+        'datasets': ['skypile_local'],
+        'weights': [1.0],
+        'description': 'Pure pre-training on SkyPile data'
+    },
+}
+POSTTRAIN_MIX = {
+    'default': {
+        'datasets': ['flan_v2', 'alpaca', 'dolly', 'oasst1'],
+        'weights': [0.4, 0.3, 0.2, 0.1],
+        'description': 'Default instruction tuning mix'
+    },
+    'conversation': {
+        'datasets': ['oasst1', 'sharegpt', 'alpaca'],
+        'weights': [0.4, 0.4, 0.2],
+        'description': 'Conversation-focused mix'
+    },
+    'code_instruct': {
+        'datasets': ['code_alpaca', 'alpaca', 'flan_v2'],
+        'weights': [0.5, 0.3, 0.2],
+        'description': 'Code instruction mix'
+    },
+    'simple_instruct': {
+        'datasets': ['alpaca', 'dolly'],
+        'weights': [0.6, 0.4],
+        'description': 'Simple instruction mix for testing'
+    },
+    'minimind_mix': {
+        'datasets': ['minimind_sft', 'self_en'],
+        'weights': [0.01, 0.99],
+        'description': 'Fine-tuning on MiniMind dataset'
+    },
+    'r1_mix_strategy': {
+        'datasets': ['r1_mix_dataset'],
+        'weights': [1.0],
+        'description': 'Fine-tuning on R1 Distill dataset'
+    },
+    'gsm8k_zh_mix': {
+        'datasets': ['gsm8k_zh'],
+        'weights': [1.0],
+        'description': 'Fine-tuning on GSM8K Chinese math reasoning dataset'
+    },
+    'think_math_mix': {
+        'datasets': ['r1_mix_dataset', 'gsm8k_zh'],
+        'weights': [0.7, 0.3],
+        'description': 'Mix of R1 Distill and GSM8K Chinese for math reasoning'
+    },
+}
+DATASET_CACHE_DIR = "./dataset_cache"
+HF_CACHE_DIR = "./hf_cache"
+MAX_RETRIES = 3
+DOWNLOAD_TIMEOUT = 300
+PREPROCESSING_CONFIG = {
+    'max_seq_length': 2048,
+    'min_seq_length': 32,
+    'num_workers': 4,
+    'batch_size': 8,
+    'shuffle_buffer_size': 10000,
+    'seed': 42,
 }