Astra

File size: 3,786 Bytes

08bf07d

import torch
import time
import argparse
from threading import Thread

def gpu_worker(gpu_id, duration, tensor_size):
    """单个GPU的工作线程，负责持续进行张量运算"""
    try:
        # 设置当前线程使用的GPU
        device = torch.device(f"cuda:{gpu_id}")
        torch.cuda.set_device(device)
        
        # 打印GPU信息
        gpu_name = torch.cuda.get_device_name(device)
        print(f"GPU {gpu_id} 启动: {gpu_name}")
        
        # 创建大随机张量
        tensor_a = torch.randn(tensor_size, tensor_size, device=device)
        tensor_b = torch.randn(tensor_size, tensor_size, device=device)
        
        # 预热GPU
        for _ in range(10):
            result = torch.matmul(tensor_a, tensor_b)
            torch.cuda.synchronize(device)
        
        # 开始持续运算
        start_time = time.time()
        iterations = 0
        
        while time.time() - start_time < duration:
            # 矩阵乘法运算
            result = torch.matmul(tensor_a, tensor_b)
            
            # 定期更新张量避免优化
            if iterations % 100 == 0:
                tensor_a = 0.999 * tensor_a + 0.001 * torch.randn_like(tensor_a)
                tensor_b = 0.999 * tensor_b + 0.001 * torch.randn_like(tensor_b)
            
            iterations += 1
            
            # 每10秒打印一次状态
            if iterations % 1000 == 0:
                elapsed = time.time() - start_time
                print(f"GPU {gpu_id}: 已运行 {elapsed:.1f} 秒, 完成 {iterations} 次迭代")
            
            # 短暂同步确保计算完成
            if iterations % 100 == 0:
                torch.cuda.synchronize(device)
        
        # 计算结束统计
        elapsed = time.time() - start_time
        print(f"GPU {gpu_id} 完成: 总时间 {elapsed:.1f} 秒, 总迭代 {iterations} 次, "
              f"平均每秒 {iterations/elapsed:.2f} 次")
        
    except Exception as e:
        print(f"GPU {gpu_id} 出错: {str(e)}")
    
    finally:
        # 清理内存
        torch.cuda.empty_cache()

def multi_gpu_stress_test(duration, tensor_size, use_gpus=None):
    """多GPU压力测试主函数"""
    # 检查可用GPU数量
    available_gpus = torch.cuda.device_count()
    if available_gpus == 0:
        print("错误: 未检测到可用GPU")
        return
    
    # 确定要使用的GPU
    if use_gpus is None:
        use_gpus = list(range(available_gpus))
    else:
        # 验证GPU ID有效性
        use_gpus = [g for g in use_gpus if 0 <= g < available_gpus]
        if not use_gpus:
            print("错误: 没有有效的GPU ID")
            return
    
    print(f"检测到 {available_gpus} 张GPU，将使用 {len(use_gpus)} 张: {use_gpus}")
    
    # 为每张GPU创建并启动线程
    threads = []
    for gpu_id in use_gpus:
        thread = Thread(target=gpu_worker, args=(gpu_id, duration, tensor_size))
        threads.append(thread)
        thread.start()
    
    # 等待所有线程完成
    for thread in threads:
        thread.join()
    
    print("所有GPU测试完成")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='多GPU压力测试程序')
    parser.add_argument('--duration', type=int, default=6000000, 
                      help='测试持续时间(秒)，默认60秒')
    parser.add_argument('--size', type=int, default=4096, 
                      help='每张GPU上的张量大小，默认4096x4096')
    parser.add_argument('--gpus', type=int, nargs='+', 
                      help=f'指定要使用的GPU ID，如 --gpus 0 1 2 3 4 5 6 7')
    args = parser.parse_args()
    
    # 运行多GPU测试
    multi_gpu_stress_test(args.duration, args.size, args.gpus)