art0123 committed on
Commit
170c186
·
verified ·
1 Parent(s): f12f268

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -44,3 +44,4 @@ insightface/insightface-0.7.3-cp311-cp311-win_amd64.whl filter=lfs diff=lfs merg
44
  insightface/insightface-0.7.3-cp312-cp312-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
45
  insightface/insightface-0.7.3-cp313-cp313-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
46
  insightface/insightface-0.7.3-cp39-cp39-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
 
 
44
  insightface/insightface-0.7.3-cp312-cp312-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
45
  insightface/insightface-0.7.3-cp313-cp313-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
46
  insightface/insightface-0.7.3-cp39-cp39-win_amd64.whl filter=lfs diff=lfs merge=lfs -text
47
+ triton/VC_redist.x64.exe filter=lfs diff=lfs merge=lfs -text
triton/VC_redist.x64.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc0ff0eb1dc3f5188ae6300faef32bf5beeba4bdd6e8e445a9184072096b713b
3
+ size 25635768
triton/python_3.11.9_include_libs.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2cc1d8e2cad23401c152d690669605d4356e12c638954d1e9231b85e1c79966
3
+ size 372089
triton/python_3.12.7_include_libs.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:145cde69c753dd5a78f4525ef1f5cbe0f2642678fdefd9e79e2bbea1a91d80ea
3
+ size 437666
triton/test_triton.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import triton
3
+ import triton.language as tl
4
+
5
@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Element-wise addition: writes x + y into output for the slice of
    # BLOCK_SIZE consecutive indices owned by this program instance.
    #
    # x_ptr, y_ptr   -- base pointers of the two inputs
    # output_ptr     -- base pointer of the destination
    # n_elements     -- number of valid elements; indices at or beyond it
    #                   are masked out of every load and store
    # BLOCK_SIZE     -- compile-time number of indices per program instance

    # Indices covered by this instance along launch axis 0.
    idx = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)

    # The last block may run past the end of the data; mask those lanes.
    in_bounds = idx < n_elements

    lhs = tl.load(x_ptr + idx, mask=in_bounds)
    rhs = tl.load(y_ptr + idx, mask=in_bounds)
    tl.store(output_ptr + idx, lhs + rhs, mask=in_bounds)
15
+
16
def add(x: torch.Tensor, y: torch.Tensor):
    """Return ``x + y`` computed by launching ``add_kernel``.

    The result tensor is allocated with ``torch.empty_like(x)``, and the
    element count passed to the kernel is taken from that tensor.
    """
    result = torch.empty_like(x)
    total = result.numel()

    def launch_grid(meta):
        # 1-D launch grid: enough programs to cover every element,
        # rounding up when total is not a multiple of BLOCK_SIZE.
        return (triton.cdiv(total, meta["BLOCK_SIZE"]),)

    add_kernel[launch_grid](x, y, result, total, BLOCK_SIZE=1024)
    return result
22
+
23
# Smoke test: add a small CUDA tensor to itself with the Triton kernel and
# with plain PyTorch, then print the difference between the two results.
a = torch.rand(3, device="cuda")

# Triton result and PyTorch reference; the two computations are independent.
b_compiled = add(a, a)
b = a + a

print(b_compiled - b)
print("If you see tensor([0., 0., 0.], device='cuda:0'), then it works")