Upload 4 files

Files changed (5) hide show

.gitattributes CHANGED Viewed

@@ -44,3 +44,4 @@ insightface/insightface-0.7.3-cp311-cp311-win_amd64.whl filter=lfs diff=lfs merg
 insightface/insightface-0.7.3-cp312-cp312-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
 insightface/insightface-0.7.3-cp313-cp313-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
 insightface/insightface-0.7.3-cp39-cp39-win_amd64.whl filter=lfs diff=lfs merge=lfs -text

 insightface/insightface-0.7.3-cp312-cp312-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
 insightface/insightface-0.7.3-cp313-cp313-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
 insightface/insightface-0.7.3-cp39-cp39-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
+triton/VC_redist.x64.exe filter=lfs diff=lfs merge=lfs -text

triton/VC_redist.x64.exe ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc0ff0eb1dc3f5188ae6300faef32bf5beeba4bdd6e8e445a9184072096b713b
+size 25635768

triton/python_3.11.9_include_libs.zip ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2cc1d8e2cad23401c152d690669605d4356e12c638954d1e9231b85e1c79966
+size 372089

triton/python_3.12.7_include_libs.zip ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:145cde69c753dd5a78f4525ef1f5cbe0f2642678fdefd9e79e2bbea1a91d80ea
+size 437666

triton/test_triton.py ADDED Viewed

+import torch
+import triton
+import triton.language as tl
+@triton.jit  # compile the function below with Triton's JIT
+def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):  # element-wise x + y into output; BLOCK_SIZE is a compile-time constant
+    pid = tl.program_id(axis=0)  # index of this program instance along launch axis 0
+    block_start = pid * BLOCK_SIZE  # offset of the first element handled by this program
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)  # BLOCK_SIZE consecutive element offsets
+    mask = offsets < n_elements  # disable offsets at or beyond n_elements (partial last block)
+    x = tl.load(x_ptr + offsets, mask=mask)  # masked load of the x operands
+    y = tl.load(y_ptr + offsets, mask=mask)  # masked load of the y operands
+    output = x + y  # element-wise sum
+    tl.store(output_ptr + offsets, output, mask=mask)  # masked store: only offsets below n_elements are written
+def add(x: torch.Tensor, y: torch.Tensor):  # launch add_kernel on x and y and return the result as a new tensor
+    output = torch.empty_like(x)  # result buffer shaped like x; filled by the kernel
+    n_elements = output.numel()  # total number of elements to process
+    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)  # 1-D launch grid: ceil(n_elements / BLOCK_SIZE) programs
+    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)  # each program instance covers up to 1024 elements
+    return output  # NOTE(review): y's shape/device is never checked against x - confirm callers pass matching tensors
+a = torch.rand(3, device="cuda")  # three random values created on the CUDA device (runs at import time)
+b = a + a  # reference result computed by PyTorch
+b_compiled = add(a, a)  # same sum computed through the Triton kernel
+print(b_compiled - b)  # difference between the two results; all zeros means they agree
+print("If you see tensor([0., 0., 0.], device='cuda:0'), then it works")