# NMFL / test_multi_chip_gpu.py
# Factor Studios
# Upload 43 files — commit 520d6cf (verified)
"""
Test for hyperrealistic multi-chip GPU system with full SM and tensor core realism,
using WebSocket-based storage for zero CPU usage.
"""
import time
import numpy as np
from gpu_arch import Chip, OpticalInterconnect
def test_multi_chip_gpu():
    """End-to-end smoke test of the simulated multi-chip GPU system.

    Builds ``num_chips`` simulated chips (each with ``num_sms`` SMs) backed by
    a WebSocket storage service, wires them into a ring via an optical
    interconnect, then exercises tensor-core matmuls, cross-chip transfers,
    and matmuls sourced from register / shared / global memory.

    Raises:
        RuntimeError: if the WebSocket GPU storage server is unreachable.

    NOTE(review): the original file's indentation was lost; the nesting below
    is reconstructed from control-flow context — confirm loop scoping against
    the project's history before relying on it.
    """
    print("\n=== Multi-Chip GPU System with WebSocket Storage Test ===")
    num_chips = 2  # Use 2 for realism, scale up as needed
    num_sms = 4  # Use 4 for realism, scale up as needed
    # Initialize WebSocket storage for all chips (imported lazily, inside the
    # test, so importing this module does not require the storage dependency).
    from websocket_storage import WebSocketGPUStorage
    storage = WebSocketGPUStorage()
    if not storage.wait_for_connection():
        raise RuntimeError("Could not connect to GPU storage server")
    chips = [Chip(
        chip_id=i,
        num_sms=num_sms,
        vram_size_gb=None  # Use unlimited WebSocket storage
    ) for i in range(num_chips)]
    print(f"Created {num_chips} chips with unlimited WebSocket storage, each with {num_sms} SMs.")
    # Connect chips in a ring topology with optical interconnect
    optical_link = OpticalInterconnect(bandwidth_tbps=800, latency_ns=1)
    for i in range(num_chips):
        # (i+1) % num_chips closes the ring: the last chip links back to chip 0.
        chips[i].connect_chip(chips[(i+1)%num_chips], optical_link)
    # Initialize shared WebSocket storage for cross-chip communication
    for chip in chips:
        # NOTE(review): `c.chip_id` here treats connected_chips entries as
        # Chip objects, but the cross-chip block below unpacks them as
        # (chip, link) tuples — one of the two is likely wrong; confirm
        # against gpu_arch.Chip.connect_chip.
        chip_state = {
            "chip_id": chip.chip_id,
            "num_sms": num_sms,
            "connected_chips": [(c.chip_id, "optical") for c in chip.connected_chips]
        }
        storage.store_state(f"chips/{chip.chip_id}", "config", chip_state)
    # Run tensor core operations with WebSocket-backed storage
    print("\n=== Testing WebSocket-backed Multi-Chip Operations ===")
    # Create test matrices (fixed 2x2 operands so results are easy to eyeball)
    matrix_a = [[1.0, 2.0], [3.0, 4.0]]
    matrix_b = [[5.0, 6.0], [7.0, 8.0]]
    for chip in chips:
        print(f"\n--- Chip {chip.chip_id} ---")
        # Store matrices in WebSocket storage for this chip
        storage.store_tensor(f"chip_{chip.chip_id}/matrix_a", np.array(matrix_a))
        storage.store_tensor(f"chip_{chip.chip_id}/matrix_b", np.array(matrix_b))
        # Process using each SM: round-trip the operands through storage,
        # multiply on the SM's tensor core, and store the product back.
        for sm_id in range(num_sms):
            sm = chip.get_sm(sm_id)
            # Load matrices from WebSocket storage
            matrix_a_data = storage.load_tensor(f"chip_{chip.chip_id}/matrix_a")
            matrix_b_data = storage.load_tensor(f"chip_{chip.chip_id}/matrix_b")
            # Perform tensor core operation
            result = sm.tensor_core_matmul(matrix_a_data.tolist(), matrix_b_data.tolist())
            # Store result back in WebSocket storage
            storage.store_tensor(f"chip_{chip.chip_id}/sm_{sm_id}/result", np.array(result))
            print(f"SM {sm_id} tensor core matmul result: {result}")
        # Test cross-chip communication (only the first neighbor is exercised)
        if len(chip.connected_chips) > 0:
            next_chip, link = chip.connected_chips[0]
            # Get result from this chip
            result_data = storage.load_tensor(f"chip_{chip.chip_id}/sm_0/result")
            # Transfer to next chip through optical link (modeled as a write
            # to a shared storage key, not an actual link-level transfer).
            transfer_id = f"transfer_chip_{chip.chip_id}_to_{next_chip.chip_id}"
            storage.store_tensor(transfer_id, result_data)
            print(f"Transferred result from Chip {chip.chip_id} to Chip {next_chip.chip_id} via {link.__class__.__name__}")
        # NOTE(review): `sm` below is the loop variable leaked from the SM
        # loop above, i.e. this chip's LAST SM — confirm that is intended.
        # Seed the register file with deterministic values (reg[i][j] = i + j).
        for i in range(len(sm.register_file)):
            for j in range(len(sm.register_file[0])):
                sm.register_file[i][j] = float(i + j)
        # Seed shared memory with a repeating 0..9 pattern.
        for addr in range(sm.shared_mem.size):
            sm.shared_mem.write(addr, float(addr % 10))
        # Seed global memory with a repeating 0..99 pattern. NOTE(review):
        # this iterates every byte address in size_bytes — potentially very
        # slow for realistic sizes; confirm units against gpu_arch.
        for addr in range(sm.global_mem.size_bytes if sm.global_mem else 0):
            sm.global_mem.write(addr, float(addr % 100))
        # Test tensor core matmul from registers
        reg_result = sm.tensor_core_matmul_from_memory('register', 0, 'register', 0, (2,2), (2,2))
        print(f"SM {sm.sm_id} tensor core matmul from registers: {reg_result}")
        # Test tensor core matmul from shared memory
        shared_result = sm.tensor_core_matmul_from_memory('shared', 0, 'shared', 0, (2,2), (2,2))
        print(f"SM {sm.sm_id} tensor core matmul from shared memory: {shared_result}")
        # Test tensor core matmul from global memory
        global_result = sm.tensor_core_matmul_from_memory('global', 0, 'global', 0, (2,2), (2,2))
        print(f"SM {sm.sm_id} tensor core matmul from global memory: {global_result}")
    print("\n=== Multi-Chip GPU System Test Complete ===")
if __name__ == "__main__":
start = time.time()
test_multi_chip_gpu()
print(f"Test runtime: {time.time()-start:.3f} seconds")