# merge.py — merge FSDP-sharded checkpoints into a single .safetensors file
# Source: Hugging Face upload by b-re-w (commit 3e04ca0, verified)
import torch
import torch.distributed.tensor
from safetensors.torch import save_file
import os
from collections import OrderedDict
import gc
def merge_fsdp_to_safetensors(rank0_path, rank1_path, output_path, target_dtype=None):
    """
    Merge two FSDP-sharded .pt checkpoint files into a single .safetensors file.

    Keys present in both ranks are concatenated along dim 0 (the usual FSDP
    sharding axis — TODO confirm this matches how the checkpoint was sharded);
    keys present in only one rank are copied through unchanged.

    Args:
        rank0_path (str): Path to the rank 0 .pt file.
        rank1_path (str): Path to the rank 1 .pt file.
        output_path (str): Path of the .safetensors file to write.
        target_dtype (torch.dtype, optional): If given, cast every tensor to
            this dtype (e.g. torch.float16, torch.bfloat16) before merging.

    Returns:
        OrderedDict: The merged state dict. Non-tensor entries are kept in the
        returned dict but are skipped when writing the safetensors file.
    """
    print("Loading rank 0 checkpoint...")
    rank0_dict = torch.load(rank0_path, map_location='cpu', weights_only=False)
    print("Loading rank 1 checkpoint...")
    rank1_dict = torch.load(rank1_path, map_location='cpu', weights_only=False)

    def convert_dtensor_to_tensor(state_dict):
        """Replace each DTensor with its local shard and record dtypes per key."""
        converted_dict = OrderedDict()
        dtype_info = {}
        for key, value in state_dict.items():
            if hasattr(value, '_local_tensor'):
                # DTensor: extract the shard stored in this rank's file.
                tensor = value._local_tensor
                converted_dict[key] = tensor
                dtype_info[key] = tensor.dtype
                print(f"Converted DTensor to tensor: {key} (dtype: {tensor.dtype})")
            elif isinstance(value, torch.Tensor):
                converted_dict[key] = value
                dtype_info[key] = value.dtype
            else:
                # Keep non-tensor entries (e.g. step counters) unchanged.
                converted_dict[key] = value
                dtype_info[key] = type(value).__name__
        return converted_dict, dtype_info

    print("Converting DTensors to regular tensors...")
    rank0_dict, rank0_dtypes = convert_dtensor_to_tensor(rank0_dict)
    rank1_dict, rank1_dtypes = convert_dtensor_to_tensor(rank1_dict)

    # Report the dtypes found in each shard.
    print("\n📋 Original dtype information:")
    all_dtypes_r0 = {d for d in rank0_dtypes.values() if isinstance(d, torch.dtype)}
    all_dtypes_r1 = {d for d in rank1_dtypes.values() if isinstance(d, torch.dtype)}
    all_dtypes = all_dtypes_r0 | all_dtypes_r1
    print(f" Rank 0 dtypes found: {all_dtypes_r0}")
    print(f" Rank 1 dtypes found: {all_dtypes_r1}")
    print(f" All dtypes: {all_dtypes}")
    if target_dtype is not None:
        print(f" Target dtype specified: {target_dtype}")
    else:
        print(" No target dtype specified - keeping original dtypes")

    def maybe_cast(tensor):
        """Cast a tensor to target_dtype when one was requested."""
        if target_dtype is not None and isinstance(tensor, torch.Tensor) and tensor.dtype != target_dtype:
            return tensor.to(target_dtype)
        return tensor

    merged_state_dict = OrderedDict()
    all_keys = set(rank0_dict.keys()) | set(rank1_dict.keys())
    print(f"Total unique keys found: {len(all_keys)}")
    for key in sorted(all_keys):
        rank0_tensor = rank0_dict.get(key)
        rank1_tensor = rank1_dict.get(key)
        if rank0_tensor is not None and rank1_tensor is not None:
            # Present in both ranks: cast if requested, then concatenate
            # along dim 0.
            print(f"Merging key: {key}")
            print(f" Rank 0 shape: {rank0_tensor.shape}, dtype: {rank0_tensor.dtype}")
            print(f" Rank 1 shape: {rank1_tensor.shape}, dtype: {rank1_tensor.dtype}")
            rank0_tensor = maybe_cast(rank0_tensor)
            rank1_tensor = maybe_cast(rank1_tensor)
            merged_tensor = torch.cat([rank0_tensor, rank1_tensor], dim=0)
            merged_state_dict[key] = merged_tensor
            print(f" Merged shape: {merged_tensor.shape}, dtype: {merged_tensor.dtype}")
        elif rank0_tensor is not None:
            # Present only on rank 0.
            tensor = maybe_cast(rank0_tensor)
            print(f"Adding from rank 0: {key} (shape: {tensor.shape if isinstance(tensor, torch.Tensor) else 'N/A'}, dtype: {tensor.dtype if isinstance(tensor, torch.Tensor) else type(tensor).__name__})")
            merged_state_dict[key] = tensor
        elif rank1_tensor is not None:
            # Present only on rank 1.
            tensor = maybe_cast(rank1_tensor)
            print(f"Adding from rank 1: {key} (shape: {tensor.shape if isinstance(tensor, torch.Tensor) else 'N/A'}, dtype: {tensor.dtype if isinstance(tensor, torch.Tensor) else type(tensor).__name__})")
            merged_state_dict[key] = tensor

    print(f"\nTotal merged parameters: {len(merged_state_dict)}")
    # Free the per-rank shard dicts before serializing the merged model.
    del rank0_dict, rank1_dict
    gc.collect()

    print(f"Saving merged model to {output_path}...")
    # Final dtype distribution of the merged tensors.
    final_dtypes = {}
    for key, tensor in merged_state_dict.items():
        if isinstance(tensor, torch.Tensor):
            final_dtypes[tensor.dtype] = final_dtypes.get(tensor.dtype, 0) + 1
    print(f"📋 Final merged model dtype distribution:")
    for dtype, count in final_dtypes.items():
        print(f" {dtype}: {count} tensors")

    # Bug fix: safetensors.save_file raises on non-tensor values and on
    # non-contiguous tensors. Skip non-tensor entries (with a warning) and
    # force contiguity on the rest instead of crashing here.
    save_dict = {}
    for key, tensor in merged_state_dict.items():
        if isinstance(tensor, torch.Tensor):
            save_dict[key] = tensor.contiguous()
        else:
            print(f"⚠️ Skipping non-tensor entry for safetensors: {key} ({type(tensor).__name__})")
    save_file(save_dict, output_path)
    print("✅ Successfully saved merged model!")
    return merged_state_dict
def merge_with_custom_concatenation(rank0_path, rank1_path, output_path, concat_rules=None):
    """
    Merge two FSDP-sharded .pt checkpoints using per-key concatenation rules.

    Args:
        rank0_path (str): Path to the rank 0 .pt file.
        rank1_path (str): Path to the rank 1 .pt file.
        output_path (str): Path of the .safetensors file to write.
        concat_rules (dict, optional): Mapping of key substring to concat
            dimension, e.g. {'weight': 0}. The first pattern contained in a
            key wins; keys matching no pattern are concatenated along dim 0.

    Returns:
        OrderedDict: The merged state dict.
    """
    if concat_rules is None:
        # Default rules: concatenate weights and biases along dim 0.
        concat_rules = {
            'weight': 0,
            'bias': 0,
        }
    print("Loading checkpoints...")
    rank0_dict = torch.load(rank0_path, map_location='cpu', weights_only=False)
    rank1_dict = torch.load(rank1_path, map_location='cpu', weights_only=False)

    def to_local(value):
        # Consistency fix: like merge_fsdp_to_safetensors, unwrap DTensors to
        # their local shard so torch.cat and save_file work on FSDP output.
        return value._local_tensor if hasattr(value, '_local_tensor') else value

    merged_state_dict = OrderedDict()
    all_keys = set(rank0_dict.keys()) | set(rank1_dict.keys())
    for key in sorted(all_keys):
        rank0_tensor = to_local(rank0_dict.get(key))
        rank1_tensor = to_local(rank1_dict.get(key))
        if rank0_tensor is not None and rank1_tensor is not None:
            # Pick the concat dimension from the first matching rule.
            concat_dim = 0  # default
            for pattern, dim in concat_rules.items():
                if pattern in key:
                    concat_dim = dim
                    break
            print(f"Merging {key} along dimension {concat_dim}")
            merged_state_dict[key] = torch.cat([rank0_tensor, rank1_tensor], dim=concat_dim)
        elif rank0_tensor is not None:
            merged_state_dict[key] = rank0_tensor
        elif rank1_tensor is not None:
            merged_state_dict[key] = rank1_tensor

    # Clean up and save.
    del rank0_dict, rank1_dict
    gc.collect()
    print(f"Saving to {output_path}...")
    # Bug fix: safetensors.save_file rejects non-tensor values and
    # non-contiguous tensors, so sanitize before saving.
    save_dict = {}
    for key, value in merged_state_dict.items():
        if isinstance(value, torch.Tensor):
            save_dict[key] = value.contiguous()
        else:
            print(f"⚠️ Skipping non-tensor entry for safetensors: {key} ({type(value).__name__})")
    save_file(save_dict, output_path)
    print("✅ Merge completed!")
    return merged_state_dict
def comprehensive_verification(rank0_path, rank1_path, merged_path):
    """Comprehensively verify that a merge produced a correct checkpoint.

    Reloads both per-rank .pt files and the merged .safetensors file, then
    compares key sets, total parameter counts, and per-key shapes, assuming
    shared keys were concatenated along dim 0.

    Args:
        rank0_path: Path to the rank 0 .pt file.
        rank1_path: Path to the rank 1 .pt file.
        merged_path: Path to the merged .safetensors file.

    Returns:
        bool: True when every check passes, False otherwise.
    """
    # Imported here so DTensor checkpoints can be unpickled and the merged
    # file can be read lazily.
    import torch.distributed.tensor
    from safetensors import safe_open
    print("🔍 Starting comprehensive verification...")
    # 1. Load the original per-rank files
    print("\n📁 Loading original files...")
    rank0_dict = torch.load(rank0_path, map_location='cpu', weights_only=False)
    rank1_dict = torch.load(rank1_path, map_location='cpu', weights_only=False)
    # Convert DTensors to regular tensors (extract the local shard)
    def convert_dtensor_to_tensor(state_dict):
        # NOTE(review): non-tensor values pass through unchanged; the shape
        # checks below assume every surviving value is a tensor — confirm.
        converted_dict = {}
        for key, value in state_dict.items():
            if hasattr(value, '_local_tensor'):
                converted_dict[key] = value._local_tensor
            elif isinstance(value, torch.Tensor):
                converted_dict[key] = value
            else:
                converted_dict[key] = value
        return converted_dict
    rank0_dict = convert_dtensor_to_tensor(rank0_dict)
    rank1_dict = convert_dtensor_to_tensor(rank1_dict)
    # 2. Analyze the original files
    rank0_keys = set(rank0_dict.keys())
    rank1_keys = set(rank1_dict.keys())
    all_original_keys = rank0_keys | rank1_keys
    shared_keys = rank0_keys & rank1_keys
    rank0_only = rank0_keys - rank1_keys
    rank1_only = rank1_keys - rank0_keys
    print(f"📊 Original files analysis:")
    print(f" Rank 0 keys: {len(rank0_keys)}")
    print(f" Rank 1 keys: {len(rank1_keys)}")
    print(f" Shared keys: {len(shared_keys)}")
    print(f" Rank 0 only: {len(rank0_only)}")
    print(f" Rank 1 only: {len(rank1_only)}")
    print(f" Total unique keys: {len(all_original_keys)}")
    # 3. Count the original parameters
    original_params = 0
    original_shapes = {}
    for key in all_original_keys:
        if key in shared_keys:
            # Shared keys count as the size of the two shards concatenated
            r0_tensor = rank0_dict[key]
            r1_tensor = rank1_dict[key]
            combined_shape = list(r0_tensor.shape)
            combined_shape[0] += r1_tensor.shape[0]  # assumes concat along dim 0
            original_shapes[key] = tuple(combined_shape)
            original_params += r0_tensor.numel() + r1_tensor.numel()
        elif key in rank0_only:
            original_shapes[key] = rank0_dict[key].shape
            original_params += rank0_dict[key].numel()
        elif key in rank1_only:
            original_shapes[key] = rank1_dict[key].shape
            original_params += rank1_dict[key].numel()
    print(f" Original total parameters: {original_params:,}")
    # 4. Analyze the merged file
    print(f"\n📁 Loading merged file: {merged_path}")
    merged_params = 0
    merged_keys = set()
    merged_shapes = {}
    with safe_open(merged_path, framework="pt", device="cpu") as f:
        merged_keys = set(f.keys())
        for key in f.keys():
            tensor = f.get_tensor(key)
            merged_shapes[key] = tensor.shape
            merged_params += tensor.numel()
    print(f"📊 Merged file analysis:")
    print(f" Merged keys: {len(merged_keys)}")
    print(f" Merged parameters: {merged_params:,}")
    # 5. Compare and verify
    print(f"\n✅ Verification Results:")
    # Compare key counts
    keys_match = len(merged_keys) == len(all_original_keys)
    print(f" Keys count match: {keys_match} ({len(merged_keys)} vs {len(all_original_keys)})")
    # Compare parameter counts
    params_match = merged_params == original_params
    print(f" Parameter count match: {params_match} ({merged_params:,} vs {original_params:,})")
    # Compare key names
    missing_keys = all_original_keys - merged_keys
    extra_keys = merged_keys - all_original_keys
    if missing_keys:
        print(f" ❌ Missing keys: {missing_keys}")
    if extra_keys:
        print(f" ❌ Extra keys: {extra_keys}")
    # Compare individual tensor shapes
    shape_mismatches = []
    for key in merged_keys & all_original_keys:
        if merged_shapes[key] != original_shapes[key]:
            shape_mismatches.append((key, merged_shapes[key], original_shapes[key]))
    if shape_mismatches:
        print(f" ❌ Shape mismatches:")
        for key, merged_shape, original_shape in shape_mismatches[:5]:  # show first 5 only
            print(f" {key}: {merged_shape} vs {original_shape}")
        if len(shape_mismatches) > 5:
            print(f" ... and {len(shape_mismatches) - 5} more")
    # 6. Detailed analysis (optional)
    print(f"\n📋 Detailed Analysis:")
    print(f" Shared keys that should be concatenated:")
    for key in sorted(list(shared_keys))[:10]:  # show first 10 only
        r0_shape = rank0_dict[key].shape
        r1_shape = rank1_dict[key].shape
        expected_shape = list(r0_shape)
        expected_shape[0] += r1_shape[0]
        actual_shape = merged_shapes.get(key, "MISSING")
        status = "✅" if tuple(expected_shape) == actual_shape else "❌"
        print(f" {status} {key}: {r0_shape} + {r1_shape} -> {actual_shape}")
    if len(shared_keys) > 10:
        print(f" ... and {len(shared_keys) - 10} more shared keys")
    # 7. Final result
    overall_success = keys_match and params_match and not missing_keys and not extra_keys and not shape_mismatches
    print(f"\n{'='*50}")
    if overall_success:
        print("🎉 MERGE VERIFICATION SUCCESSFUL!")
        print(" All parameters have been correctly merged.")
    else:
        print("⚠️ MERGE VERIFICATION FOUND ISSUES!")
        print(" Please review the mismatches above.")
    print(f"{'='*50}")
    # Cleanup
    del rank0_dict, rank1_dict
    gc.collect()
    return overall_success
# Example usage
if __name__ == "__main__":
    # Checkpoint shard paths (replace with your actual filenames).
    rank0_file = "model_rank_0.pt"
    rank1_file = "model_rank_1.pt"
    output_file = "merged_model.safetensors"

    # dtype option: cast everything to bf16 (None keeps original dtypes).
    target_dtype = torch.bfloat16

    # Run the basic merge.
    print("Starting merge process...")
    merged_dict = merge_fsdp_to_safetensors(rank0_file, rank1_file, output_file, target_dtype)

    # Run the full verification pass over the result.
    print("\nStarting comprehensive verification...")
    verification_passed = comprehensive_verification(rank0_file, rank1_file, output_file)
    summary = (
        f"\n🎉 Successfully merged and verified FSDP model to {output_file}"
        if verification_passed
        else f"\n⚠️ Merge completed but verification found issues. Please review the output above."
    )
    print(summary)

    # Extra: quick smoke test that the merged file can be read back.
    print(f"\n🔍 Testing if merged model can be loaded...")
    try:
        from safetensors import safe_open
        with safe_open(output_file, framework="pt", device="cpu") as reader:
            for sample_key in list(reader.keys())[:3]:
                sample = reader.get_tensor(sample_key)
                print(f" ✅ Successfully loaded {sample_key}: {sample.shape}, dtype: {sample.dtype}")
        print(" ✅ Merged model loads correctly!")
    except Exception as load_error:
        print(f" ❌ Error loading merged model: {load_error}")

    # Closing hints for choosing a target dtype.
    print(f"\n💡 Tip: To change dtype, modify 'target_dtype' in the script:")
    print(f" - torch.float16 for fp16 (smaller file, less precision)")
    print(f" - torch.bfloat16 for bf16 (good balance)")
    print(f" - torch.float32 for fp32 (larger file, best precision)")
    print(f" - None to keep original dtypes")