"""Minimal HPU script: bf16 ``torch.matmul`` writing into a preallocated ``out`` tensor.

Builds two small random bfloat16 matrices, moves them to the "hpu" device,
multiplies them into a third preallocated HPU tensor via ``out=``, then copies
the result back to the CPU and prints every tensor for inspection.
"""
import torch

# Kept although `htcore` is never referenced below: the import is needed for
# its side effect (presumably it registers the "hpu" device with PyTorch --
# confirm against the Habana/Gaudi docs before removing).
import habana_frameworks.torch.core as htcore

# Edge length of the square matrices.
msize = 2

# Operands are created on the CPU and then moved to the HPU.
A = torch.randn(msize, msize, dtype=torch.bfloat16).to("hpu")
B = torch.randn(msize, msize, dtype=torch.bfloat16).to("hpu")

# Output buffer. It starts with random values, so a matmul that failed to
# write into `out` would be visible in the printed result.
C = torch.randn(msize, msize, dtype=torch.bfloat16).to("hpu")

torch.matmul(A, B, out=C)

# Synchronize with the device before copying the result back to the host.
torch.hpu.synchronize()

# Host-side copy of the result.
R = C.to("cpu")

print(A)
print(B)
print(C)
print(R)