szxllm
/

MultiModal

Model card Files Files and versions

xet

Community

szxllm committed on Feb 25

Commit

b7e4db5

verified ·

1 Parent(s): 6d0972d

Update data_loader.py

Browse files

Files changed (1) hide show

data_loader.py +179 -241

data_loader.py CHANGED Viewed

@@ -12,8 +12,7 @@ import requests
 from io import BytesIO
 from torchvision import transforms
 import logging
-# 设置日志
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -29,8 +28,6 @@ from data_config import (
     DATASET_CACHE_DIR,
     HF_CACHE_DIR
 )
-# 图像变换
 image_transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.ToTensor(),
@@ -59,7 +56,6 @@ class PreTrainDataset(IterableDataset):
         self.max_samples = max_samples
         self.samples_generated = 0
-        # 获取混合配置
         if mix_name not in PRETRAIN_MIX:
             raise ValueError(f"Unknown mix: {mix_name}. Available: {list(PRETRAIN_MIX.keys())}")
@@ -69,12 +65,6 @@ class PreTrainDataset(IterableDataset):
         if not dataset_names:
             raise ValueError(f"No datasets found in mix: {mix_name}")
-        logger.info(f"Loading pretrain mix: {mix_name}")
-        logger.info(f"  Datasets: {dataset_names}")
-        logger.info(f"  Weights: {weights}")
-        # 加载数据集
         self.datasets = []
         self.probabilities = []
@@ -97,7 +87,6 @@ class PreTrainDataset(IterableDataset):
         if not self.datasets:
             raise ValueError("No datasets loaded successfully")
-        # 归一化概率
         total = sum(self.probabilities)
         self.probabilities = [p / total for p in self.probabilities]
@@ -111,14 +100,34 @@ class PreTrainDataset(IterableDataset):
                 'streaming': config.get('streaming', self.streaming),
                 'cache_dir': HF_CACHE_DIR,
             }
             if 'config' in config:
                 load_kwargs['name'] = config['config']
             ds = load_dataset(**load_kwargs)
             return ds
         except Exception as e:
             logger.error(f"Failed to load {config.get('hf_path', 'unknown')}: {e}")
             return None
     def _process_text_sample(self, sample: Dict, config: Dict) -> Optional[Dict]:
@@ -130,21 +139,25 @@ class PreTrainDataset(IterableDataset):
                 return None
             text = text.strip()
-            if len(text) < 10:
                 return None
-            # Tokenize
             encoding = self.tokenizer(
                 text,
-                max_length=self.max_length,
                 truncation=True,
-                padding='max_length',
-                return_tensors='pt'
             )
             return {
-                'input_ids': encoding['input_ids'].squeeze(0),
-                'attention_mask': encoding['attention_mask'].squeeze(0),
                 'type': 'text'
             }
         except Exception as e:
@@ -161,8 +174,6 @@ class PreTrainDataset(IterableDataset):
             if not text or image is None:
                 return None
-            # 处理图像
             if isinstance(image, str):
                 try:
                     response = requests.get(image, timeout=5)
@@ -174,11 +185,8 @@ class PreTrainDataset(IterableDataset):
                 image = image.convert('RGB')
             else:
                 return None
-            # 转换图像
             image_tensor = image_transform(image)
-            # Tokenize文本
             encoding = self.tokenizer(
                 text,
                 max_length=self.max_length,
@@ -198,34 +206,26 @@ class PreTrainDataset(IterableDataset):
             return None
     def __iter__(self):
-        """迭代器"""
         worker_info = torch.utils.data.get_worker_info()
         if worker_info is not None:
-            # 多worker时设置不同的随机种子
             random.seed(self.seed + worker_info.id)
             np.random.seed(self.seed + worker_info.id)
         else:
             random.seed(self.seed)
             np.random.seed(self.seed)
-        # 创建数据集迭代器
         iterators = [iter(ds) for _, ds, _ in self.datasets]
         self.samples_generated = 0
         while True:
-            # 检查是否达到最大样本数
             if self.max_samples and self.samples_generated >= self.max_samples:
                 break
             try:
-                # 根据概率选择数据集
                 idx = np.random.choice(len(self.datasets), p=self.probabilities)
                 name, _, config = self.datasets[idx]
-                # 从选中的数据集获取样本
                 sample = next(iterators[idx])
-                # 处理样本
                 processed = None
                 if config.get('type') in ['text', 'code']:
                     processed = self._process_text_sample(sample, config)
@@ -240,7 +240,6 @@ class PreTrainDataset(IterableDataset):
                     yield processed
             except StopIteration:
-                # 重新创建迭代器
                 try:
                     iterators[idx] = iter(self.datasets[idx][1])
                 except Exception as e:
@@ -269,7 +268,6 @@ class PostTrainDataset(Dataset):
         self.max_length = max_length
         self.split = split
-        # 获取混合配置
         if mix_name not in POSTTRAIN_MIX:
             raise ValueError(f"Unknown mix: {mix_name}. Available: {list(POSTTRAIN_MIX.keys())}")
@@ -283,7 +281,6 @@ class PostTrainDataset(Dataset):
         logger.info(f"Loading posttrain mix: {mix_name}")
         logger.info(f"  Datasets: {dataset_names}")
-        # 加载和合并数据集
         all_datasets = []
         for name in dataset_names:
@@ -306,14 +303,12 @@ class PostTrainDataset(Dataset):
                 ds = load_dataset(**load_kwargs)
-                # 限制样本数
                 if config.get('max_samples'):
                     if hasattr(ds, 'take'):
                         ds = ds.take(config['max_samples'])
                     elif hasattr(ds, 'select'):
                         ds = ds.select(range(min(len(ds), config['max_samples'])))
-                # 添加数据集标识
                 def add_source(example):
                     example['_source'] = name
                     example['_config'] = config
@@ -329,14 +324,12 @@ class PostTrainDataset(Dataset):
                 logger.error(f"Error loading {name}: {e}")
                 continue
-        # 合并数据集
         if not all_datasets:
             raise ValueError("No datasets loaded successfully")
         if len(all_datasets) == 1:
             self.dataset = all_datasets[0]
         else:
-            # 交织数据集
             probabilities = [w / sum(weights[:len(all_datasets)])
                            for w in weights[:len(all_datasets)]]
             self.dataset = interleave_datasets(
@@ -345,8 +338,6 @@ class PostTrainDataset(Dataset):
                 seed=42,
                 stopping_strategy='all_exhausted'
             )
-        # 限制总样本数
         if max_samples and hasattr(self.dataset, '__len__'):
             actual_len = min(len(self.dataset), max_samples)
             self.dataset = self.dataset.select(range(actual_len))
@@ -355,7 +346,6 @@ class PostTrainDataset(Dataset):
         logger.info(f"Total samples: {dataset_len}")
     def _format_instruction(self, sample: Dict, config: Dict) -> str:
-        """格式化instruction"""
         try:
             data_type = config.get('type', 'instruction')
@@ -367,8 +357,6 @@ class PostTrainDataset(Dataset):
                 instruction = sample.get(instruction_field, '')
                 input_text = sample.get(input_field, '')
                 context = sample.get(context_field, '') if context_field else ''
-                # 构建prompt
                 prompt_parts = [f"Instruction: {instruction}"]
                 if context:
@@ -385,40 +373,47 @@ class PostTrainDataset(Dataset):
                     conversations = sample['conversations']
                     if isinstance(conversations, list) and len(conversations) > 0:
                         dialogue = []
-                        for conv in conversations[:-1]:
-                            role = conv.get('from', 'user')
-                            content = conv.get('value', '')
                             dialogue.append(f"{role}: {content}")
                         return "\n".join(dialogue) + "\nassistant:"
                 elif 'messages' in sample:
-                    # 标准消息格式
                     messages = sample['messages']
                     if isinstance(messages, list) and len(messages) > 0:
                         dialogue = []
-                        for msg in messages[:-1]:
                             role = msg.get('role', 'user')
                             content = msg.get('content', '')
                             dialogue.append(f"{role}: {content}")
                         return "\n".join(dialogue) + "\nassistant:"
-                # 如果没有标准格式，尝试使用text字段
                 return sample.get('text', '')
             elif data_type == 'code_instruction':
-                # 代码instruction格式
                 instruction_field = config.get('instruction_field', 'instruction')
                 instruction = sample.get(instruction_field, '')
                 return f"### Instruction:\n{instruction}\n### Response:"
             elif data_type == 'multimodal_instruction':
-                # 多模态instruction
                 instruction_field = config.get('instruction_field', 'conversations')
                 conversations = sample.get(instruction_field, [])
                 if isinstance(conversations, list) and len(conversations) > 0:
-                    # 提取对话历史（除了最后一条回复）
                     dialogue = []
-                    for conv in conversations[:-1]:
                         role = conv.get('from', 'user')
                         content = conv.get('value', '')
                         dialogue.append(f"{role}: {content}")
@@ -432,6 +427,7 @@ class PostTrainDataset(Dataset):
             return ""
     def _get_response(self, sample: Dict, config: Dict) -> str:
         try:
             data_type = config.get('type', 'instruction')
@@ -444,20 +440,33 @@ class PostTrainDataset(Dataset):
                 if 'conversations' in sample:
                     conversations = sample['conversations']
                     if isinstance(conversations, list) and len(conversations) > 0:
-                        return conversations[-1].get('value', '')
                 elif 'messages' in sample:
                     messages = sample['messages']
                     if isinstance(messages, list) and len(messages) > 0:
                         return messages[-1].get('content', '')
                 return ""
             elif data_type == 'multimodal_instruction':
                 instruction_field = config.get('instruction_field', 'conversations')
                 conversations = sample.get(instruction_field, [])
                 if isinstance(conversations, list) and len(conversations) > 0:
-                    return conversations[-1].get('value', '')
                 return ""
             else:
@@ -473,75 +482,47 @@ class PostTrainDataset(Dataset):
     def __getitem__(self, idx):
         try:
             sample = self.dataset[idx]
-            # 获取配置
             if '_config' not in sample:
                 logger.warning(f"Sample at index {idx} missing _config")
                 return None
             config = sample['_config']
-            # 格式化 instruction 和 response
             instruction_text = self._format_instruction(sample, config)
             response_text = self._get_response(sample, config)
             if not instruction_text or not response_text:
                 return None
             pad_token_id = self.tokenizer.pad_token_id
             if pad_token_id is None:
                 pad_token_id = self.tokenizer.eos_token_id
-            instruction_max_len = self.max_length // 2
-            # Tokenize 不做 padding，手动处理
             instruction_enc = self.tokenizer(
                 instruction_text,
                 truncation=True,
                 max_length=instruction_max_len,
-                add_special_tokens=False,
-                return_tensors='pt'
             )
-            instr_ids = instruction_enc['input_ids'].squeeze(0)
-            # Instruction 手动 Padding
-            instr_len = instr_ids.size(0)
-            if instr_len < instruction_max_len:
-                padding = torch.full((instruction_max_len - instr_len,), pad_token_id, dtype=torch.long)
-                instr_ids = torch.cat([instr_ids, padding])
-                instr_mask = torch.cat([torch.ones(instr_len, dtype=torch.long), torch.zeros(instruction_max_len - instr_len, dtype=torch.long)])
-            else:
-                instr_mask = torch.ones(instruction_max_len, dtype=torch.long)
-            response_max_len = self.max_length // 2
-            # Tokenize: 预留1个位置给EOS
             response_enc = self.tokenizer(
                 response_text,
                 truncation=True,
                 max_length=response_max_len - 1,
                 add_special_tokens=False,
-                return_tensors='pt'
             )
-            resp_ids = response_enc['input_ids'].squeeze(0)
-            eos_token = torch.tensor([self.tokenizer.eos_token_id], dtype=torch.long)
-            resp_ids = torch.cat([resp_ids, eos_token])
-            # Response 手动 Padding
-            curr_resp_len = resp_ids.size(0)
-            if curr_resp_len < response_max_len:
-                padding = torch.full((response_max_len - curr_resp_len,), pad_token_id, dtype=torch.long)
-                resp_ids = torch.cat([resp_ids, padding])
-                resp_mask = torch.cat([torch.ones(curr_resp_len, dtype=torch.long), torch.zeros(response_max_len - curr_resp_len, dtype=torch.long)])
-            else:
-                resp_mask = torch.ones(response_max_len, dtype=torch.long)
             result = {
                 'instruction': instr_ids,
                 'response': resp_ids,
-                'instruction_mask': instr_mask,
-                'response_mask': resp_mask,
                 'task': sample.get('_source', 'unknown'),
                 'modality_data': None
             }
@@ -564,150 +545,93 @@ class PostTrainDataset(Dataset):
             traceback.print_exc()
             return None
-class PreferenceDataset(Dataset):
-    def __init__(
-        self,
-        dataset_name: str = 'hh_rlhf',
-        tokenizer=None,
-        max_length: int = 1024,
-        max_samples: Optional[int] = None,
-        split: str = 'train'
-    ):
-        super().__init__()
-        if tokenizer is None:
-            raise ValueError("tokenizer cannot be None")
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-        if dataset_name not in POSTTRAIN_DATASETS:
-            raise ValueError(f"Unknown dataset: {dataset_name}. Available: {list(POSTTRAIN_DATASETS.keys())}")
-        config = POSTTRAIN_DATASETS[dataset_name]
-        if config.get('type') != 'preference':
-            raise ValueError(f"{dataset_name} is not a preference dataset (type: {config.get('type')})")
-        logger.info(f"Loading preference dataset: {dataset_name}")
-        load_kwargs = {
-            'path': config['hf_path'],
-            'split': split,
-            'cache_dir': HF_CACHE_DIR,
         }
-        if 'config' in config:
-            load_kwargs['name'] = config['config']
-        self.dataset = load_dataset(**load_kwargs)
-        self.chosen_field = config.get('chosen_field', 'chosen')
-        self.rejected_field = config.get('rejected_field', 'rejected')
-        if max_samples and len(self.dataset) > max_samples:
-            self.dataset = self.dataset.select(range(max_samples))
-        logger.info(f"Loaded {len(self.dataset)} preference pairs")
-    def __len__(self):
-        return len(self.dataset)
-    def __getitem__(self, idx):
-        try:
-            sample = self.dataset[idx]
-            chosen_text = sample.get(self.chosen_field, '')
-            rejected_text = sample.get(self.rejected_field, '')
-            if not chosen_text or not rejected_text:
-                return None
-            # Tokenize
-            chosen_enc = self.tokenizer(
-                chosen_text,
-                max_length=self.max_length,
-                truncation=True,
-                padding='max_length',
-                return_tensors='pt'
-            )
-            rejected_enc = self.tokenizer(
-                rejected_text,
-                max_length=self.max_length,
-                truncation=True,
-                padding='max_length',
-                return_tensors='pt'
-            )
-            return (
-                chosen_enc['input_ids'].squeeze(0),
-                rejected_enc['input_ids'].squeeze(0),
-                chosen_enc['attention_mask'].squeeze(0),
-                rejected_enc['attention_mask'].squeeze(0)
-            )
-        except Exception as e:
-            logger.debug(f"Error getting preference item at index {idx}: {e}")
             return None
-def collate_fn_v2(batch):
-    batch = [item for item in batch if item is not None]
-    if not batch:
-        logger.warning("Empty batch after filtering None values")
-        # 返回一个空的占位batch而不是None
-        return {
-            'input_ids': torch.empty(0),
-            'attention_mask': torch.empty(0)
-        }
-    # 检查是否是preference数据
-    if isinstance(batch[0], tuple):
-        if len(batch[0]) == 4:  # 包含attention_mask
-            chosen = torch.stack([item[0] for item in batch])
-            rejected = torch.stack([item[1] for item in batch])
-            chosen_mask = torch.stack([item[2] for item in batch])
-            rejected_mask = torch.stack([item[3] for item in batch])
-            return {
-                'chosen': chosen,
-                'rejected': rejected,
-                'chosen_mask': chosen_mask,
-                'rejected_mask': rejected_mask
-            }
-        else:
-            chosen = torch.stack([item[0] for item in batch])
-            rejected = torch.stack([item[1] for item in batch])
-            return {'chosen': chosen, 'rejected': rejected}
-    keys = batch[0].keys()
-    collated = {}
-    for key in keys:
-        if key in ['instruction', 'response', 'instruction_mask',
-                   'response_mask', 'input_ids', 'attention_mask']:
-            tensors = [item[key] for item in batch if item.get(key) is not None]
-            if tensors:
-                collated[key] = torch.stack(tensors)
-            else:
-                collated[key] = None
-        elif key == 'modality_data':
-            # 处理多模态数据
-            modality_list = [item[key] for item in batch if item.get(key) is not None]
-            if modality_list and any(m is not None for m in modality_list):
-                # 收集图像
-                images = [m.get('image') for m in modality_list if m and 'image' in m]
-                if images:
-                    collated[key] = {'image': torch.stack(images)}
-                else:
-                    collated[key] = None
             else:
-                collated[key] = None
         else:
-            collated[key] = [item[key] for item in batch]
-    return collated
 def create_pretrain_dataloader(
@@ -718,18 +642,26 @@ def create_pretrain_dataloader(
     max_length: int = 2048,
     max_samples: Optional[int] = None
 ):
     dataset = PreTrainDataset(
         mix_name=mix_name,
         tokenizer=tokenizer,
         max_length=max_length,
-        streaming=True,
         max_samples=max_samples
     )
     return DataLoader(
         dataset,
         batch_size=batch_size,
         num_workers=num_workers,
-        collate_fn=collate_fn_v2
     )
@@ -743,6 +675,10 @@ def create_posttrain_dataloader(
     split: str = 'train',
     shuffle: bool = True
 ):
     dataset = PostTrainDataset(
         mix_name=mix_name,
         tokenizer=tokenizer,
@@ -750,14 +686,16 @@ def create_posttrain_dataloader(
         max_samples=max_samples,
         split=split
     )
     return DataLoader(
         dataset,
         batch_size=batch_size,
         shuffle=shuffle,
         num_workers=num_workers,
-        collate_fn=collate_fn_v2,
         pin_memory=True,
-        drop_last=False
     )
@@ -783,6 +721,6 @@ def create_preference_dataloader(
         batch_size=batch_size,
         shuffle=shuffle,
         num_workers=num_workers,
-        collate_fn=collate_fn_v2,
         pin_memory=True
     )

 from io import BytesIO
 from torchvision import transforms
 import logging
+import os
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
     DATASET_CACHE_DIR,
     HF_CACHE_DIR
 )
 image_transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.ToTensor(),
         self.max_samples = max_samples
         self.samples_generated = 0
         if mix_name not in PRETRAIN_MIX:
             raise ValueError(f"Unknown mix: {mix_name}. Available: {list(PRETRAIN_MIX.keys())}")
         if not dataset_names:
             raise ValueError(f"No datasets found in mix: {mix_name}")
         self.datasets = []
         self.probabilities = []
         if not self.datasets:
             raise ValueError("No datasets loaded successfully")
         total = sum(self.probabilities)
         self.probabilities = [p / total for p in self.probabilities]
                 'streaming': config.get('streaming', self.streaming),
                 'cache_dir': HF_CACHE_DIR,
             }
+            if 'data_files' in config:
+                files = config['data_files']
+                if isinstance(files, list):
+                    for f in files:
+                        if not os.path.exists(f):
+                            logger.error(f" Data file not found in list: {f}")
+                            return None
+                    logger.info(f" Verified {len(files)} local files.")
+                elif isinstance(files, str):
+                    if not os.path.exists(files):
+                        logger.error(f" Data file not found: {files}")
+                        return None
+                    logger.info(f" Verified local file: {files}")
+                load_kwargs['data_files'] = files
             if 'config' in config:
                 load_kwargs['name'] = config['config']
+            logger.info(f"   Loading HF dataset: {config['hf_path']}...")
             ds = load_dataset(**load_kwargs)
             return ds
         except Exception as e:
             logger.error(f"Failed to load {config.get('hf_path', 'unknown')}: {e}")
+            import traceback
+            traceback.print_exc()
             return None
     def _process_text_sample(self, sample: Dict, config: Dict) -> Optional[Dict]:
                 return None
             text = text.strip()
+            if len(text) < 10:
                 return None
+            max_input_len = self.max_length - 1
             encoding = self.tokenizer(
                 text,
+                max_length=max_input_len,
                 truncation=True,
+                padding=False,
+                add_special_tokens=False,
+                return_tensors=None
             )
+            input_ids = encoding['input_ids']
+            input_ids.append(self.tokenizer.eos_token_id)
+            input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
             return {
+                'input_ids': input_ids_tensor,
                 'type': 'text'
             }
         except Exception as e:
             if not text or image is None:
                 return None
             if isinstance(image, str):
                 try:
                     response = requests.get(image, timeout=5)
                 image = image.convert('RGB')
             else:
                 return None
             image_tensor = image_transform(image)
             encoding = self.tokenizer(
                 text,
                 max_length=self.max_length,
             return None
     def __iter__(self):
         worker_info = torch.utils.data.get_worker_info()
         if worker_info is not None:
             random.seed(self.seed + worker_info.id)
             np.random.seed(self.seed + worker_info.id)
         else:
             random.seed(self.seed)
             np.random.seed(self.seed)
         iterators = [iter(ds) for _, ds, _ in self.datasets]
         self.samples_generated = 0
         while True:
             if self.max_samples and self.samples_generated >= self.max_samples:
                 break
             try:
                 idx = np.random.choice(len(self.datasets), p=self.probabilities)
                 name, _, config = self.datasets[idx]
                 sample = next(iterators[idx])
                 processed = None
                 if config.get('type') in ['text', 'code']:
                     processed = self._process_text_sample(sample, config)
                     yield processed
             except StopIteration:
                 try:
                     iterators[idx] = iter(self.datasets[idx][1])
                 except Exception as e:
         self.max_length = max_length
         self.split = split
         if mix_name not in POSTTRAIN_MIX:
             raise ValueError(f"Unknown mix: {mix_name}. Available: {list(POSTTRAIN_MIX.keys())}")
         logger.info(f"Loading posttrain mix: {mix_name}")
         logger.info(f"  Datasets: {dataset_names}")
         all_datasets = []
         for name in dataset_names:
                 ds = load_dataset(**load_kwargs)
                 if config.get('max_samples'):
                     if hasattr(ds, 'take'):
                         ds = ds.take(config['max_samples'])
                     elif hasattr(ds, 'select'):
                         ds = ds.select(range(min(len(ds), config['max_samples'])))
                 def add_source(example):
                     example['_source'] = name
                     example['_config'] = config
                 logger.error(f"Error loading {name}: {e}")
                 continue
         if not all_datasets:
             raise ValueError("No datasets loaded successfully")
         if len(all_datasets) == 1:
             self.dataset = all_datasets[0]
         else:
             probabilities = [w / sum(weights[:len(all_datasets)])
                            for w in weights[:len(all_datasets)]]
             self.dataset = interleave_datasets(
                 seed=42,
                 stopping_strategy='all_exhausted'
             )
         if max_samples and hasattr(self.dataset, '__len__'):
             actual_len = min(len(self.dataset), max_samples)
             self.dataset = self.dataset.select(range(actual_len))
         logger.info(f"Total samples: {dataset_len}")
     def _format_instruction(self, sample: Dict, config: Dict) -> str:
         try:
             data_type = config.get('type', 'instruction')
                 instruction = sample.get(instruction_field, '')
                 input_text = sample.get(input_field, '')
                 context = sample.get(context_field, '') if context_field else ''
                 prompt_parts = [f"Instruction: {instruction}"]
                 if context:
                     conversations = sample['conversations']
                     if isinstance(conversations, list) and len(conversations) > 0:
                         dialogue = []
+                        last_role = conversations[-1].get('role', conversations[-1].get('from', 'user')).lower()
+                        upto = len(conversations)
+                        if last_role == 'assistant':
+                            upto = len(conversations) - 1
+                        for conv in conversations[:upto]:
+                            role = conv.get('role', conv.get('from', 'user'))
+                            content = conv.get('content', conv.get('value', ''))
                             dialogue.append(f"{role}: {content}")
                         return "\n".join(dialogue) + "\nassistant:"
                 elif 'messages' in sample:
                     messages = sample['messages']
                     if isinstance(messages, list) and len(messages) > 0:
                         dialogue = []
+                        last_role = messages[-1].get('role', 'user').lower()
+                        upto = len(messages)
+                        if last_role == 'assistant':
+                            upto = len(messages) - 1
+                        for msg in messages[:upto]:
                             role = msg.get('role', 'user')
                             content = msg.get('content', '')
                             dialogue.append(f"{role}: {content}")
                         return "\n".join(dialogue) + "\nassistant:"
                 return sample.get('text', '')
             elif data_type == 'code_instruction':
                 instruction_field = config.get('instruction_field', 'instruction')
                 instruction = sample.get(instruction_field, '')
                 return f"### Instruction:\n{instruction}\n### Response:"
             elif data_type == 'multimodal_instruction':
                 instruction_field = config.get('instruction_field', 'conversations')
                 conversations = sample.get(instruction_field, [])
                 if isinstance(conversations, list) and len(conversations) > 0:
                     dialogue = []
+                    last_role = conversations[-1].get('from', 'user').lower() if isinstance(conversations[-1].get('from', 'user'), str) else 'user'
+                    upto = len(conversations)
+                    if last_role == 'assistant':
+                        upto = len(conversations) - 1
+                    for conv in conversations[:upto]:
                         role = conv.get('from', 'user')
                         content = conv.get('value', '')
                         dialogue.append(f"{role}: {content}")
             return ""
     def _get_response(self, sample: Dict, config: Dict) -> str:
+        """获取响应（兼容 <think>/<answer> 标签）"""
         try:
             data_type = config.get('type', 'instruction')
                 if 'conversations' in sample:
                     conversations = sample['conversations']
                     if isinstance(conversations, list) and len(conversations) > 0:
+                        last_turn = conversations[-1]
+                        content = last_turn.get('content', last_turn.get('value', ''))
+                        if not isinstance(content, str):
+                            return ''
+                        # 仅当最后一条 role 为 assistant 时返回
+                        role = last_turn.get('role', last_turn.get('from', '')).lower()
+                        if role != 'assistant':
+                            return ''
+                        return str(content).strip() if content else ""
                 elif 'messages' in sample:
                     messages = sample['messages']
                     if isinstance(messages, list) and len(messages) > 0:
                         return messages[-1].get('content', '')
                 return ""
             elif data_type == 'multimodal_instruction':
                 instruction_field = config.get('instruction_field', 'conversations')
                 conversations = sample.get(instruction_field, [])
                 if isinstance(conversations, list) and len(conversations) > 0:
+                    last = conversations[-1].get('value', '')
+                    import re
+                    m = re.search(r'<answer>([\\s\\S]*?)</answer>', last, re.IGNORECASE)
+                    if m:
+                        return m.group(1).strip()
+                    return re.sub(r'<think>[\\s\\S]*?</think>', '', last, flags=re.IGNORECASE).strip()
                 return ""
             else:
     def __getitem__(self, idx):
         try:
             sample = self.dataset[idx]
             if '_config' not in sample:
                 logger.warning(f"Sample at index {idx} missing _config")
                 return None
             config = sample['_config']
             instruction_text = self._format_instruction(sample, config)
             response_text = self._get_response(sample, config)
             if not instruction_text or not response_text:
                 return None
             pad_token_id = self.tokenizer.pad_token_id
             if pad_token_id is None:
                 pad_token_id = self.tokenizer.eos_token_id
+            instruction_max_len = 256
             instruction_enc = self.tokenizer(
                 instruction_text,
                 truncation=True,
                 max_length=instruction_max_len,
+                add_special_tokens=False,
+                return_tensors=None
             )
+            instr_ids_list = instruction_enc['input_ids']
+            instr_ids = torch.tensor(instr_ids_list, dtype=torch.long)
+            response_max_len = self.max_length - len(instr_ids)
             response_enc = self.tokenizer(
                 response_text,
                 truncation=True,
                 max_length=response_max_len - 1,
                 add_special_tokens=False,
+                return_tensors=None
             )
+            resp_ids_list = response_enc['input_ids']
+            resp_ids_list = resp_ids_list + [self.tokenizer.eos_token_id]
+            resp_ids = torch.tensor(resp_ids_list, dtype=torch.long)
             result = {
                 'instruction': instr_ids,
                 'response': resp_ids,
                 'task': sample.get('_source', 'unknown'),
                 'modality_data': None
             }
             traceback.print_exc()
             return None
+from torch.nn.utils.rnn import pad_sequence
class DynamicCollate:
    """Collate variable-length samples into a right-padded batch.

    Each batch is padded only up to the length of its own longest sequence.

    Args:
        pad_token_id: Token id used to fill the padded positions.
    """

    def __init__(self, pad_token_id: int):
        self.pad_token_id = pad_token_id

    def __call__(self, batch):
        """Pad ``item['input_ids']`` of every non-``None`` item in ``batch``.

        Args:
            batch: Sequence of sample dicts (or ``None`` for failed samples),
                each holding a 1-D ``'input_ids'`` tensor or list of ints.

        Returns:
            Dict with ``'input_ids'`` of shape ``(batch, longest)`` and a
            matching ``'attention_mask'`` (1 for real tokens, 0 for padding).
            When every item is ``None`` both entries are empty tensors.
        """
        batch = [item for item in batch if item is not None]
        if not batch:
            # Integer dtype so the placeholder matches a regular batch.
            return {
                'input_ids': torch.empty(0, dtype=torch.long),
                'attention_mask': torch.empty(0, dtype=torch.long)
            }

        input_ids_list = [torch.as_tensor(item['input_ids']) for item in batch]
        padded_input_ids = pad_sequence(
            input_ids_list,
            batch_first=True,
            padding_value=self.pad_token_id
        )

        # Build the mask from the true sequence lengths rather than comparing
        # against pad_token_id: the loaders in this file fall back to the EOS
        # id as pad id, and a value comparison would then mask out every
        # genuine EOS token inside the sequences.
        lengths = torch.tensor(
            [seq.size(0) for seq in input_ids_list],
            device=padded_input_ids.device
        )
        positions = torch.arange(
            padded_input_ids.size(1),
            device=padded_input_ids.device
        )
        attention_mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

        return {
            'input_ids': padded_input_ids,
            'attention_mask': attention_mask
        }
def collate_fn_v2_factory(pad_token_id: int):
    """Build a collate function that pads with ``pad_token_id``.

    Args:
        pad_token_id: Token id used to right-pad instruction and response
            sequences to the longest sequence of the batch.

    Returns:
        A ``collate_fn_v2(batch)`` callable suitable for ``DataLoader``.
    """

    def _pad_with_mask(sequences):
        """Right-pad 1-D tensors and return ``(padded, mask)``.

        The mask is derived from the real sequence lengths instead of a
        comparison with ``pad_token_id``: the loaders in this file may use the
        EOS id as pad id, and every response ends with EOS, so a value-based
        mask would hide that final EOS (and any other token equal to the pad
        id).
        """
        padded = pad_sequence(sequences, batch_first=True, padding_value=pad_token_id)
        lengths = torch.tensor(
            [seq.size(0) for seq in sequences],
            device=padded.device
        )
        positions = torch.arange(padded.size(1), device=padded.device)
        mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()
        return padded, mask

    def collate_fn_v2(batch):
        """Collate preference tuples or instruction/response sample dicts.

        Returns ``None`` when the batch holds no usable item. Tuple items are
        stacked into ``chosen``/``rejected`` (plus masks when four elements
        are present). Dict items yield padded ``instruction``/``response``
        tensors with their masks, optional stacked images under
        ``modality_data`` and the list of ``task`` names.
        """
        batch = [item for item in batch if item is not None]
        if not batch:
            logger.warning("Empty batch after filtering None values")
            return None

        # Preference data: tuples of already equal-length tensors.
        if isinstance(batch[0], tuple):
            if len(batch[0]) == 4:
                chosen = torch.stack([item[0] for item in batch])
                rejected = torch.stack([item[1] for item in batch])
                chosen_mask = torch.stack([item[2] for item in batch])
                rejected_mask = torch.stack([item[3] for item in batch])
                return {
                    'chosen': chosen,
                    'rejected': rejected,
                    'chosen_mask': chosen_mask,
                    'rejected_mask': rejected_mask
                }
            else:
                chosen = torch.stack([item[0] for item in batch])
                rejected = torch.stack([item[1] for item in batch])
                return {'chosen': chosen, 'rejected': rejected}

        collated = {}

        instr_list = [item['instruction'] for item in batch if item.get('instruction') is not None]
        if instr_list:
            collated['instruction'], collated['instruction_mask'] = _pad_with_mask(instr_list)
        else:
            collated['instruction'] = None
            collated['instruction_mask'] = None

        resp_list = [item['response'] for item in batch if item.get('response') is not None]
        if resp_list:
            collated['response'], collated['response_mask'] = _pad_with_mask(resp_list)
        else:
            collated['response'] = None
            collated['response_mask'] = None

        modality_list = [item.get('modality_data') for item in batch if item.get('modality_data') is not None]
        # Skip entries whose 'image' is missing or None so torch.stack only
        # ever receives tensors.
        images = [m['image'] for m in modality_list if m and m.get('image') is not None]
        if images:
            collated['modality_data'] = {'image': torch.stack(images)}
        else:
            collated['modality_data'] = None

        collated['task'] = [item.get('task', 'unknown') for item in batch]
        return collated

    return collate_fn_v2
 def create_pretrain_dataloader(
     max_length: int = 2048,
     max_samples: Optional[int] = None
 ):
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
     dataset = PreTrainDataset(
         mix_name=mix_name,
         tokenizer=tokenizer,
         max_length=max_length,
+        streaming=True,
         max_samples=max_samples
     )
+    collate_fn = DynamicCollate(pad_token_id=tokenizer.pad_token_id)
     return DataLoader(
         dataset,
         batch_size=batch_size,
         num_workers=num_workers,
+        collate_fn=collate_fn,
+        pin_memory=True
     )
     split: str = 'train',
     shuffle: bool = True
 ):
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
     dataset = PostTrainDataset(
         mix_name=mix_name,
         tokenizer=tokenizer,
         max_samples=max_samples,
         split=split
     )
+    collate_fn = collate_fn_v2_factory(pad_token_id=tokenizer.pad_token_id)
     return DataLoader(
         dataset,
         batch_size=batch_size,
         shuffle=shuffle,
         num_workers=num_workers,
+        collate_fn=collate_fn,
         pin_memory=True,
+        drop_last=False
     )
         batch_size=batch_size,
         shuffle=shuffle,
         num_workers=num_workers,
+        collate_fn=collate_fn_v2_factory(pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id),
         pin_memory=True
     )