|
|
"""Profile model performance and memory usage"""
|
|
|
import sys
|
|
|
import os
|
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
|
|
import torch
|
|
|
import time
|
|
|
import argparse
|
|
|
from torch.profiler import profile, ProfilerActivity, record_function
|
|
|
from src.models.ultrathink import UltraThinkModel, UltraThinkConfig
|
|
|
from src.models.architecture import ModelConfig
|
|
|
|
|
|
|
|
|
def profile_model(model, input_shape=(2, 512), device='cuda', num_iters=10):
    """Profile the model's train-step (forward + backward) and report timing/memory.

    Args:
        model: Module exposing ``config.model_config.vocab_size`` whose forward
            accepts ``input_ids``/``labels`` and returns a dict with a ``'loss'``
            entry (assumed from the call sites below — confirm against the model).
        input_shape: ``(batch_size, seq_length)`` of the synthetic batch.
        device: ``'cuda'`` or ``'cpu'``.
        num_iters: Number of profiled train-step iterations.

    Returns:
        The ``torch.profiler.profile`` object holding the recorded events.
    """
    print(f"Profiling model on {device}")
    print(f"Input shape: {input_shape}")
    print(f"Number of iterations: {num_iters}")
    print("=" * 60)

    model = model.to(device)
    model.train()

    vocab_size = model.config.model_config.vocab_size
    batch_size, seq_length = input_shape

    # Synthetic batch: token ids drawn uniformly over the vocabulary.
    input_ids = torch.randint(0, vocab_size, input_shape, device=device)
    labels = torch.randint(0, vocab_size, input_shape, device=device)

    def _sync():
        # Wait for pending CUDA kernels before any wall-clock measurement;
        # no-op on CPU.
        if device == 'cuda':
            torch.cuda.synchronize()

    print("Warming up...")

    # Warm up forward AND backward so autograd kernels and allocator pools are
    # initialized before profiling. (The previous no_grad-only warmup left the
    # first profiled backward pass artificially slow.)
    for _ in range(5):
        warm_out = model(input_ids=input_ids, labels=labels)
        warm_out['loss'].backward()
        model.zero_grad(set_to_none=True)
    _sync()

    # Reset peak-memory counters so the statistics printed below reflect only
    # the profiled iterations, not warmup.
    if device == 'cuda' and torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    print("Profiling...")

    with profile(
        activities=[ProfilerActivity.CPU] + ([ProfilerActivity.CUDA] if device == 'cuda' else []),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
    ) as prof:
        for i in range(num_iters):
            with record_function(f"iteration_{i}"):
                output = model(input_ids=input_ids, labels=labels)
                loss = output['loss']
                loss.backward()
                # Zero grads each step so memory behaves like a real optimizer
                # step instead of accumulating across iterations.
                model.zero_grad(set_to_none=True)
            _sync()

    print("\n" + "=" * 60)
    print("Top 20 Operations by Time")
    print("=" * 60)
    sort_key = "cuda_time_total" if device == 'cuda' else "cpu_time_total"
    print(prof.key_averages().table(
        sort_by=sort_key, row_limit=20
    ))

    if torch.cuda.is_available() and device == 'cuda':
        print("\n" + "=" * 60)
        print("GPU Memory Statistics")
        print("=" * 60)
        print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"Reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
        print(f"Max Allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
        print(f"Max Reserved: {torch.cuda.max_memory_reserved() / 1e9:.2f} GB")

    print("\n" + "=" * 60)
    print("Performance Metrics")
    print("=" * 60)

    # Inference-only timing: forward pass without labels or gradients.
    _sync()
    start = time.time()
    for _ in range(10):
        with torch.no_grad():
            _ = model(input_ids=input_ids)
    _sync()
    end = time.time()

    avg_time = (end - start) / 10
    tokens_per_sec = (batch_size * seq_length) / avg_time

    print(f"Average forward pass time: {avg_time * 1000:.2f} ms")
    print(f"Throughput: {tokens_per_sec:.0f} tokens/second")
    print(f"Throughput: {batch_size / avg_time:.2f} samples/second")

    return prof
|
|
|
|
|
|
|
|
|
def build_test_model(size='tiny'):
    """Create an UltraThink model using one of three preset sizes.

    An unrecognized ``size`` silently falls back to the 'tiny' preset.
    """
    # Preset table: (n_embd, n_layer, n_head, intermediate_size).
    presets = {
        'tiny': (256, 4, 4, 1024),
        'small': (768, 12, 12, 3072),
        'medium': (1024, 24, 16, 4096),
    }
    n_embd, n_layer, n_head, intermediate_size = presets.get(size, presets['tiny'])

    model_config = ModelConfig(
        vocab_size=50257,
        n_positions=512,
        n_embd=n_embd,
        n_layer=n_layer,
        n_head=n_head,
        # Half as many KV heads as query heads (grouped-query attention).
        n_kv_head=n_head // 2,
        intermediate_size=intermediate_size,
        flash_attention=False,
        gradient_checkpointing=False,
    )

    # Every optional subsystem is disabled so profiling isolates the core model.
    wrapper_config = UltraThinkConfig(
        model_config=model_config,
        enable_dre=False,
        enable_constitutional=False,
        enable_moe=False,
        enable_multimodal=False,
        enable_rlhf=False,
    )

    return UltraThinkModel(wrapper_config)
|
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: build a preset-sized model and run the profiler."""
    arg_parser = argparse.ArgumentParser(description='Profile ULTRATHINK model')
    arg_parser.add_argument('--size', type=str, default='tiny',
                            choices=['tiny', 'small', 'medium'],
                            help='Model size')
    arg_parser.add_argument('--batch_size', type=int, default=2, help='Batch size')
    arg_parser.add_argument('--seq_length', type=int, default=512, help='Sequence length')
    arg_parser.add_argument('--device', type=str,
                            default='cuda' if torch.cuda.is_available() else 'cpu',
                            help='Device to use')
    arg_parser.add_argument('--num_iters', type=int, default=10,
                            help='Number of profiling iterations')
    arg_parser.add_argument('--export_trace', type=str, default=None,
                            help='Path to export Chrome trace')
    args = arg_parser.parse_args()

    print("Building model...")
    model = build_test_model(args.size)

    # Parameter accounting in a single pass over the parameter list.
    n_total = 0
    n_trainable = 0
    for param in model.parameters():
        count = param.numel()
        n_total += count
        if param.requires_grad:
            n_trainable += count

    print(f"Model size: {args.size}")
    print(f"Total parameters: {n_total:,}")
    print(f"Trainable parameters: {n_trainable:,}")
    # 4 bytes per parameter assumes float32 storage.
    print(f"Parameter size: {n_total * 4 / 1e9:.2f} GB (float32)")
    print()

    prof = profile_model(
        model,
        input_shape=(args.batch_size, args.seq_length),
        device=args.device,
        num_iters=args.num_iters,
    )

    if args.export_trace:
        prof.export_chrome_trace(args.export_trace)
        print(f"\nTrace exported to: {args.export_trace}")
        print("View in Chrome at: chrome://tracing")


if __name__ == "__main__":
    main()
|
|
|
|