b-re-w
/

Midm-2.0-Mini-Tuning

Model card Files Files and versions

xet

Community

b-re-w commited on Sep 22, 2025

Commit

3e04ca0

verified ·

1 Parent(s): a6bb111

Upload merge.py

Browse files

Files changed (1) hide show

merge.py +375 -0

merge.py ADDED Viewed

	@@ -0,0 +1,375 @@

+import torch
+import torch.distributed.tensor
+from safetensors.torch import save_file
+import os
+from collections import OrderedDict
+import gc
+def merge_fsdp_to_safetensors(rank0_path, rank1_path, output_path, target_dtype=None):
+    """
+    FSDP로 분할된 두 개의 .pt 파일을 하나의 .safetensors 파일로 병합
+    Args:
+        rank0_path (str): rank 0 .pt 파일 경로
+        rank1_path (str): rank 1 .pt 파일 경로
+        output_path (str): 출력할 .safetensors 파일 경로
+        target_dtype (torch.dtype, optional): 타겟 dtype (예: torch.float16, torch.bfloat16)
+    """
+    print("Loading rank 0 checkpoint...")
+    rank0_dict = torch.load(rank0_path, map_location='cpu', weights_only=False)
+    print("Loading rank 1 checkpoint...")
+    rank1_dict = torch.load(rank1_path, map_location='cpu', weights_only=False)
+    # DTensor를 일반 텐서로 변환하는 함수
+    def convert_dtensor_to_tensor(state_dict):
+        converted_dict = OrderedDict()
+        dtype_info = {}
+        for key, value in state_dict.items():
+            if hasattr(value, '_local_tensor'):
+                # DTensor인 경우 로컬 텐서 추출
+                tensor = value._local_tensor
+                converted_dict[key] = tensor
+                dtype_info[key] = tensor.dtype
+                print(f"Converted DTensor to tensor: {key} (dtype: {tensor.dtype})")
+            elif isinstance(value, torch.Tensor):
+                converted_dict[key] = value
+                dtype_info[key] = value.dtype
+            else:
+                # 다른 타입은 그대로 유지
+                converted_dict[key] = value
+                dtype_info[key] = type(value).__name__
+        return converted_dict, dtype_info
+    print("Converting DTensors to regular tensors...")
+    rank0_dict, rank0_dtypes = convert_dtensor_to_tensor(rank0_dict)
+    rank1_dict, rank1_dtypes = convert_dtensor_to_tensor(rank1_dict)
+    # dtype 정보 출력
+    print("\n📋 Original dtype information:")
+    all_dtypes_r0 = set(dtype_info for dtype_info in rank0_dtypes.values() if isinstance(dtype_info, torch.dtype))
+    all_dtypes_r1 = set(dtype_info for dtype_info in rank1_dtypes.values() if isinstance(dtype_info, torch.dtype))
+    all_dtypes = all_dtypes_r0 | all_dtypes_r1
+    print(f"   Rank 0 dtypes found: {all_dtypes_r0}")
+    print(f"   Rank 1 dtypes found: {all_dtypes_r1}")
+    print(f"   All dtypes: {all_dtypes}")
+    if target_dtype:
+        print(f"   Target dtype specified: {target_dtype}")
+    else:
+        print("   No target dtype specified - keeping original dtypes")
+    # 병합된 상태 사전
+    merged_state_dict = OrderedDict()
+    # rank 0의 모든 키들을 먼저 처리
+    all_keys = set(rank0_dict.keys()) | set(rank1_dict.keys())
+    print(f"Total unique keys found: {len(all_keys)}")
+    for key in sorted(all_keys):
+        rank0_tensor = rank0_dict.get(key)
+        rank1_tensor = rank1_dict.get(key)
+        if rank0_tensor is not None and rank1_tensor is not None:
+            # 두 rank에 모두 존재하는 경우 - 차원 확인 후 연결
+            print(f"Merging key: {key}")
+            print(f"  Rank 0 shape: {rank0_tensor.shape}, dtype: {rank0_tensor.dtype}")
+            print(f"  Rank 1 shape: {rank1_tensor.shape}, dtype: {rank1_tensor.dtype}")
+            # dtype 변환 (필요한 경우)
+            if target_dtype and rank0_tensor.dtype != target_dtype:
+                rank0_tensor = rank0_tensor.to(target_dtype)
+                print(f"  Converted rank 0 to {target_dtype}")
+            if target_dtype and rank1_tensor.dtype != target_dtype:
+                rank1_tensor = rank1_tensor.to(target_dtype)
+                print(f"  Converted rank 1 to {target_dtype}")
+            # 첫 번째 차원으로 연결 (일반적인 FSDP 샤딩 방식)
+            merged_tensor = torch.cat([rank0_tensor, rank1_tensor], dim=0)
+            merged_state_dict[key] = merged_tensor
+            print(f"  Merged shape: {merged_tensor.shape}, dtype: {merged_tensor.dtype}")
+        elif rank0_tensor is not None:
+            # rank 0에만 존재
+            tensor = rank0_tensor
+            if target_dtype and isinstance(tensor, torch.Tensor) and tensor.dtype != target_dtype:
+                tensor = tensor.to(target_dtype)
+                print(f"Converting {key} from rank 0: {rank0_tensor.dtype} -> {target_dtype}")
+            print(f"Adding from rank 0: {key} (shape: {tensor.shape if isinstance(tensor, torch.Tensor) else 'N/A'}, dtype: {tensor.dtype if isinstance(tensor, torch.Tensor) else type(tensor).__name__})")
+            merged_state_dict[key] = tensor
+        elif rank1_tensor is not None:
+            # rank 1에만 존재
+            tensor = rank1_tensor
+            if target_dtype and isinstance(tensor, torch.Tensor) and tensor.dtype != target_dtype:
+                tensor = tensor.to(target_dtype)
+                print(f"Converting {key} from rank 1: {rank1_tensor.dtype} -> {target_dtype}")
+            print(f"Adding from rank 1: {key} (shape: {tensor.shape if isinstance(tensor, torch.Tensor) else 'N/A'}, dtype: {tensor.dtype if isinstance(tensor, torch.Tensor) else type(tensor).__name__})")
+            merged_state_dict[key] = tensor
+    print(f"\nTotal merged parameters: {len(merged_state_dict)}")
+    # 메모리 정리
+    del rank0_dict, rank1_dict
+    gc.collect()
+    # safetensors로 저장
+    print(f"Saving merged model to {output_path}...")
+    # 최종 dtype 정보 출력
+    final_dtypes = {}
+    for key, tensor in merged_state_dict.items():
+        if isinstance(tensor, torch.Tensor):
+            final_dtypes[tensor.dtype] = final_dtypes.get(tensor.dtype, 0) + 1
+    print(f"📋 Final merged model dtype distribution:")
+    for dtype, count in final_dtypes.items():
+        print(f"   {dtype}: {count} tensors")
+    save_file(merged_state_dict, output_path)
+    print("✅ Successfully saved merged model!")
+    return merged_state_dict
+def merge_with_custom_concatenation(rank0_path, rank1_path, output_path, concat_rules=None):
+    """
+    사용자 정의 연결 규칙으로 병합
+    Args:
+        concat_rules (dict): 키별 연결 차원 지정 {'key_pattern': dim}
+    """
+    if concat_rules is None:
+        # 기본 규칙
+        concat_rules = {
+            'weight': 0,  # 가중치는 첫 번째 차원으로 연결
+            'bias': 0,    # 편향도 첫 번째 차원으로 연결
+        }
+    print("Loading checkpoints...")
+    rank0_dict = torch.load(rank0_path, map_location='cpu', weights_only=False)
+    rank1_dict = torch.load(rank1_path, map_location='cpu', weights_only=False)
+    merged_state_dict = OrderedDict()
+    all_keys = set(rank0_dict.keys()) | set(rank1_dict.keys())
+    for key in sorted(all_keys):
+        rank0_tensor = rank0_dict.get(key)
+        rank1_tensor = rank1_dict.get(key)
+        if rank0_tensor is not None and rank1_tensor is not None:
+            # 연결 차원 결정
+            concat_dim = 0  # 기본값
+            for pattern, dim in concat_rules.items():
+                if pattern in key:
+                    concat_dim = dim
+                    break
+            print(f"Merging {key} along dimension {concat_dim}")
+            merged_tensor = torch.cat([rank0_tensor, rank1_tensor], dim=concat_dim)
+            merged_state_dict[key] = merged_tensor
+        elif rank0_tensor is not None:
+            merged_state_dict[key] = rank0_tensor
+        elif rank1_tensor is not None:
+            merged_state_dict[key] = rank1_tensor
+    # 정리 및 저장
+    del rank0_dict, rank1_dict
+    gc.collect()
+    print(f"Saving to {output_path}...")
+    save_file(merged_state_dict, output_path)
+    print("✅ Merge completed!")
+def comprehensive_verification(rank0_path, rank1_path, merged_path):
+    """병합이 올바르게 되었는지 종합적으로 검증"""
+    import torch.distributed.tensor
+    from safetensors import safe_open
+    print("🔍 Starting comprehensive verification...")
+    # 1. 원본 파일들 로드
+    print("\n📁 Loading original files...")
+    rank0_dict = torch.load(rank0_path, map_location='cpu', weights_only=False)
+    rank1_dict = torch.load(rank1_path, map_location='cpu', weights_only=False)
+    # DTensor를 일반 텐서로 변환
+    def convert_dtensor_to_tensor(state_dict):
+        converted_dict = {}
+        for key, value in state_dict.items():
+            if hasattr(value, '_local_tensor'):
+                converted_dict[key] = value._local_tensor
+            elif isinstance(value, torch.Tensor):
+                converted_dict[key] = value
+            else:
+                converted_dict[key] = value
+        return converted_dict
+    rank0_dict = convert_dtensor_to_tensor(rank0_dict)
+    rank1_dict = convert_dtensor_to_tensor(rank1_dict)
+    # 2. 원본 파일들 분석
+    rank0_keys = set(rank0_dict.keys())
+    rank1_keys = set(rank1_dict.keys())
+    all_original_keys = rank0_keys | rank1_keys
+    shared_keys = rank0_keys & rank1_keys
+    rank0_only = rank0_keys - rank1_keys
+    rank1_only = rank1_keys - rank0_keys
+    print(f"📊 Original files analysis:")
+    print(f"   Rank 0 keys: {len(rank0_keys)}")
+    print(f"   Rank 1 keys: {len(rank1_keys)}")
+    print(f"   Shared keys: {len(shared_keys)}")
+    print(f"   Rank 0 only: {len(rank0_only)}")
+    print(f"   Rank 1 only: {len(rank1_only)}")
+    print(f"   Total unique keys: {len(all_original_keys)}")
+    # 3. 원본 파라미터 수 계산
+    original_params = 0
+    original_shapes = {}
+    for key in all_original_keys:
+        if key in shared_keys:
+            # 공유 키는 두 텐서를 연결한 크기로 계산
+            r0_tensor = rank0_dict[key]
+            r1_tensor = rank1_dict[key]
+            combined_shape = list(r0_tensor.shape)
+            combined_shape[0] += r1_tensor.shape[0]  # 첫 번째 차원으로 연결 가정
+            original_shapes[key] = tuple(combined_shape)
+            original_params += r0_tensor.numel() + r1_tensor.numel()
+        elif key in rank0_only:
+            original_shapes[key] = rank0_dict[key].shape
+            original_params += rank0_dict[key].numel()
+        elif key in rank1_only:
+            original_shapes[key] = rank1_dict[key].shape
+            original_params += rank1_dict[key].numel()
+    print(f"   Original total parameters: {original_params:,}")
+    # 4. 병합된 파일 분석
+    print(f"\n📁 Loading merged file: {merged_path}")
+    merged_params = 0
+    merged_keys = set()
+    merged_shapes = {}
+    with safe_open(merged_path, framework="pt", device="cpu") as f:
+        merged_keys = set(f.keys())
+        for key in f.keys():
+            tensor = f.get_tensor(key)
+            merged_shapes[key] = tensor.shape
+            merged_params += tensor.numel()
+    print(f"📊 Merged file analysis:")
+    print(f"   Merged keys: {len(merged_keys)}")
+    print(f"   Merged parameters: {merged_params:,}")
+    # 5. 비교 및 검증
+    print(f"\n✅ Verification Results:")
+    # 키 개수 비교
+    keys_match = len(merged_keys) == len(all_original_keys)
+    print(f"   Keys count match: {keys_match} ({len(merged_keys)} vs {len(all_original_keys)})")
+    # 파라미터 수 비교
+    params_match = merged_params == original_params
+    print(f"   Parameter count match: {params_match} ({merged_params:,} vs {original_params:,})")
+    # 키 이름 비교
+    missing_keys = all_original_keys - merged_keys
+    extra_keys = merged_keys - all_original_keys
+    if missing_keys:
+        print(f"   ❌ Missing keys: {missing_keys}")
+    if extra_keys:
+        print(f"   ❌ Extra keys: {extra_keys}")
+    # 개별 텐서 크기 비교
+    shape_mismatches = []
+    for key in merged_keys & all_original_keys:
+        if merged_shapes[key] != original_shapes[key]:
+            shape_mismatches.append((key, merged_shapes[key], original_shapes[key]))
+    if shape_mismatches:
+        print(f"   ❌ Shape mismatches:")
+        for key, merged_shape, original_shape in shape_mismatches[:5]:  # 처음 5개만 표시
+            print(f"      {key}: {merged_shape} vs {original_shape}")
+        if len(shape_mismatches) > 5:
+            print(f"      ... and {len(shape_mismatches) - 5} more")
+    # 6. 세부 분석 (선택적)
+    print(f"\n📋 Detailed Analysis:")
+    print(f"   Shared keys that should be concatenated:")
+    for key in sorted(list(shared_keys))[:10]:  # 처음 10개만 표시
+        r0_shape = rank0_dict[key].shape
+        r1_shape = rank1_dict[key].shape
+        expected_shape = list(r0_shape)
+        expected_shape[0] += r1_shape[0]
+        actual_shape = merged_shapes.get(key, "MISSING")
+        status = "✅" if tuple(expected_shape) == actual_shape else "❌"
+        print(f"      {status} {key}: {r0_shape} + {r1_shape} -> {actual_shape}")
+    if len(shared_keys) > 10:
+        print(f"      ... and {len(shared_keys) - 10} more shared keys")
+    # 7. 최종 결과
+    overall_success = keys_match and params_match and not missing_keys and not extra_keys and not shape_mismatches
+    print(f"\n{'='*50}")
+    if overall_success:
+        print("🎉 MERGE VERIFICATION SUCCESSFUL!")
+        print("   All parameters have been correctly merged.")
+    else:
+        print("⚠️  MERGE VERIFICATION FOUND ISSUES!")
+        print("   Please review the mismatches above.")
+    print(f"{'='*50}")
+    # 정리
+    del rank0_dict, rank1_dict
+    gc.collect()
+    return overall_success
+# 사용 예시
+if __name__ == "__main__":
+    # 파일 경로 설정
+    rank0_file = "model_rank_0.pt"  # 실제 파일명으로 변경
+    rank1_file = "model_rank_1.pt"  # 실제 파일명으로 변경
+    output_file = "merged_model.safetensors"
+    # dtype 옵션 설정
+    target_dtype = torch.bfloat16  # bf16으로 변환
+    # 기본 병합
+    print("Starting merge process...")
+    merged_dict = merge_fsdp_to_safetensors(rank0_file, rank1_file, output_file, target_dtype)
+    # 종합적인 검증
+    print("\nStarting comprehensive verification...")
+    verification_passed = comprehensive_verification(rank0_file, rank1_file, output_file)
+    if verification_passed:
+        print(f"\n🎉 Successfully merged and verified FSDP model to {output_file}")
+    else:
+        print(f"\n⚠️  Merge completed but verification found issues. Please review the output above.")
+    # 추가: 간단한 로드 테스트
+    print(f"\n🔍 Testing if merged model can be loaded...")
+    try:
+        from safetensors import safe_open
+        with safe_open(output_file, framework="pt", device="cpu") as f:
+            sample_keys = list(f.keys())[:3]
+            for key in sample_keys:
+                tensor = f.get_tensor(key)
+                print(f"   ✅ Successfully loaded {key}: {tensor.shape}, dtype: {tensor.dtype}")
+        print("   ✅ Merged model loads correctly!")
+    except Exception as e:
+        print(f"   ❌ Error loading merged model: {e}")
+    print(f"\n💡 Tip: To change dtype, modify 'target_dtype' in the script:")
+    print(f"   - torch.float16 for fp16 (smaller file, less precision)")
+    print(f"   - torch.bfloat16 for bf16 (good balance)")
+    print(f"   - torch.float32 for fp32 (larger file, best precision)")
+    print(f"   - None to keep original dtypes")