asd52403 committed on
Commit
a96927f
·
1 Parent(s): da67f74

init commit

Browse files
inference/convert2.py CHANGED
@@ -32,7 +32,7 @@ mapping = {
32
  }
33
 
34
  EmbedsInOneFile = 256
35
- EmbedsZKDir = "zkdata/embeds/"
36
 
37
  wkv_b_1_rescales = [32, 34, 37, 36, 33, 32, 33, 33, 30, 32,
38
  32, 30, 31, 30, 29, 30, 29, 30, 29, 29,
 
32
  }
33
 
34
  EmbedsInOneFile = 256
35
+ EmbedsZKDir = "../zkdata/embeds/"
36
 
37
  wkv_b_1_rescales = [32, 34, 37, 36, 33, 32, 33, 33, 30, 32,
38
  32, 30, 31, 30, 29, 30, 29, 30, 29, 29,
inference/generate.py CHANGED
@@ -14,7 +14,8 @@ from model import Transformer, ModelArgs, Block
14
  from concurrent.futures import ThreadPoolExecutor
15
  from kernel import softmax_q21, softmax_q19
16
 
17
- snark = False
 
18
 
19
  model = None
20
  kv_caches = [ torch.zeros(1, 4096 * 4, 512, dtype=torch.int64) ] * 61
@@ -215,8 +216,8 @@ def generate(
215
  print(str(cur_pos) + ' ---------- token list: ' + str(tokens[0][prev_pos:cur_pos].tolist()), flush=True)
216
 
217
  if snark:
218
- os.makedirs(f'zkdata/pos_{prev_pos}', exist_ok=True)
219
- saveTensor(f'zkdata/pos_{prev_pos}/tokens.bin', tokens[0][prev_pos:cur_pos].cpu())
220
 
221
  # logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
222
 
 
14
  from concurrent.futures import ThreadPoolExecutor
15
  from kernel import softmax_q21, softmax_q19
16
 
17
+ snark = True
18
+ zkDataDir = '../zkdata'
19
 
20
  model = None
21
  kv_caches = [ torch.zeros(1, 4096 * 4, 512, dtype=torch.int64) ] * 61
 
216
  print(str(cur_pos) + ' ---------- token list: ' + str(tokens[0][prev_pos:cur_pos].tolist()), flush=True)
217
 
218
  if snark:
219
+ os.makedirs(f'{zkDataDir}/pos_{prev_pos}', exist_ok=True)
220
+ saveTensor(f'{zkDataDir}/pos_{prev_pos}/tokens.bin', tokens[0][prev_pos:cur_pos].cpu())
221
 
222
  # logits = model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
223
 
inference/model.py CHANGED
@@ -21,7 +21,8 @@ block_size = 128
21
  gemm_impl: Literal["bf16", "fp8"] = "bf16"
22
  attn_impl: Literal["naive", "absorb"] = "absorb"
23
 
24
- snark = False
 
25
 
26
  @dataclass
27
  class ModelArgs:
@@ -772,7 +773,7 @@ class MLA(nn.Module):
772
  # q_down = self.wq_a(x)
773
 
774
  if snark:
775
- dirStr = f'zkdata/pos_{start_pos}/layer_{self.layer_id}'
776
  os.makedirs(dirStr, exist_ok=True)
777
  saveTensor(f'{dirStr}/wq_a_x.bin', x.cpu())
778
  saveTensor(f'{dirStr}/wq_a_w.bin', self.wq_a.weight.view(torch.uint32).cpu())
@@ -784,7 +785,7 @@ class MLA(nn.Module):
784
  (q_normed, rms) = self.q_norm(q_down)
785
 
786
  if snark:
787
- dirStr = f'zkdata/pos_{start_pos}/layer_{self.layer_id}'
788
  os.makedirs(dirStr, exist_ok=True)
789
  saveTensor(f'{dirStr}/q_norm_x.bin', q_down.cpu())
790
  saveTensor(f'{dirStr}/q_norm_weight.bin', self.q_norm.weight.view(torch.uint32).cpu())
@@ -809,13 +810,13 @@ class MLA(nn.Module):
809
  # freqs_cis 的 rescale 为 2^42, 计算之后 q_pe 的 rescale 为 2^19
810
 
811
  if snark:
812
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/q_pe_x.bin', q_pe.cpu())
813
- saveTensor(f'zkdata/freqs_cis.bin', freqs_cis.cpu())
814
 
815
  q_pe = apply_rotary_emb(q_pe, freqs_cis)
816
 
817
  if snark:
818
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/q_pe_y.bin', self.q_norm.weight.view(torch.uint32).cpu())
819
 
820
  # 获取key和value的联合表示kv(即公式41中的)和包含位置信息的key表示k_pe(即公式43中的):输入乘以向下投影矩阵wkv_a后,按照最后一个维度拆分,
821
  # 前面kv_lora_rank维作为key和value的联合表示,后面qk_rope_head_dim维添加rope位置信息(调用apply_rotary_emb)后得到包含rope位置信息的key表示;
@@ -824,7 +825,7 @@ class MLA(nn.Module):
824
  kv, kv_rem = self.wkv_a1(x)
825
 
826
  if snark:
827
- dirStr = f'zkdata/pos_{start_pos}/layer_{self.layer_id}'
828
  os.makedirs(dirStr, exist_ok=True)
829
  saveTensor(f'{dirStr}/wkv_a1_x.bin', x.cpu())
830
  saveTensor(f'{dirStr}/wkv_a1_w.bin', self.wkv_a1.weight.view(torch.uint32).cpu())
@@ -909,12 +910,12 @@ class MLA(nn.Module):
909
 
910
  # # softmax_q19 会破坏 scores 的原始数据,先拷贝一份数据
911
  if snark:
912
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_x.bin', scores.contiguous().cpu())
913
 
914
  softmax_q19(scores.contiguous(), scores_new)
915
 
916
  if snark:
917
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_y.bin', scores_new.cpu())
918
 
919
  if attn_impl == "naive":
920
  x = torch.einsum("bsht,bthd->bshd", scores, self.v_cache[:bsz, :end_pos])
@@ -1033,12 +1034,12 @@ class MLP_int(nn.Module):
1033
  # silu_q25(r1, s1)
1034
 
1035
  if snark:
1036
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_x.bin', r1.contiguous().cpu())
1037
 
1038
  silu_q23(r1, s1)
1039
 
1040
  if snark:
1041
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_y.bin', s1.cpu())
1042
 
1043
  # r2 rescale: 2^23, shape: [1, seqLen, inter_dim]
1044
  r2 = self.w3(x)
@@ -1128,13 +1129,13 @@ class Gate(nn.Module):
1128
  C = torch.empty_like(scores, dtype=torch.int64, device='cuda')
1129
 
1130
  if snark:
1131
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_x.bin', scores.cpu())
1132
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_r.bin', scores_rem.cpu())
1133
 
1134
  sigmoid_q23(scores, C)
1135
 
1136
  if snark:
1137
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_y.bin', C.cpu())
1138
 
1139
  # 当前 scores shape: [seqLen, 256]
1140
  scores = C.squeeze(0)
@@ -1147,8 +1148,8 @@ class Gate(nn.Module):
1147
  scores = scores + self.bias
1148
 
1149
  if snark:
1150
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_original_scores.bin', original_scores.contiguous().cpu())
1151
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_bias.bin', self.bias.view(torch.uint32).cpu())
1152
 
1153
  # n_groups = 8
1154
  if self.n_groups > 1:
@@ -1192,8 +1193,8 @@ class Gate(nn.Module):
1192
  weights = original_scores.gather(1, indices)
1193
 
1194
  if snark:
1195
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_indices.bin', indices.contiguous().cpu())
1196
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/gate_weights.bin', weights.contiguous().cpu())
1197
 
1198
  # print(f'weights shape: {weights.shape}')
1199
  if self.score_func == "sigmoid":
@@ -1265,12 +1266,12 @@ class Expert_int(nn.Module):
1265
  # silu_q25(r1, s1)
1266
 
1267
  if snark:
1268
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_x.bin', r1.contiguous().cpu())
1269
 
1270
  silu_q23(r1, s1)
1271
 
1272
  if snark:
1273
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_y.bin', s1.cpu())
1274
 
1275
  # r2 rescale: 2^23
1276
  r2 = self.w3(x)
@@ -1450,11 +1451,11 @@ class Block(nn.Module):
1450
  (atten_normed, rms) = self.attn_norm(x)
1451
 
1452
  if snark:
1453
- os.makedirs(f'zkdata/pos_{start_pos}/layer_{self.layer_id}', exist_ok=True)
1454
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_x.bin', x.cpu())
1455
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_weight.bin', self.attn_norm.weight.view(torch.uint32).cpu())
1456
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_y.bin', atten_normed.cpu())
1457
- saveTensor(f'zkdata/pos_{start_pos}/layer_{self.layer_id}/attn_norm_rms.bin', rms.cpu())
1458
 
1459
  # attned 的 rescale 是 2^19, shape: [1, seqLen, 7168]
1460
  attned = self.attn(atten_normed, start_pos, freqs_cis, mask)
 
21
  gemm_impl: Literal["bf16", "fp8"] = "bf16"
22
  attn_impl: Literal["naive", "absorb"] = "absorb"
23
 
24
+ snark = True
25
+ zkDataDir = '../zkdata'
26
 
27
  @dataclass
28
  class ModelArgs:
 
773
  # q_down = self.wq_a(x)
774
 
775
  if snark:
776
+ dirStr = f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}'
777
  os.makedirs(dirStr, exist_ok=True)
778
  saveTensor(f'{dirStr}/wq_a_x.bin', x.cpu())
779
  saveTensor(f'{dirStr}/wq_a_w.bin', self.wq_a.weight.view(torch.uint32).cpu())
 
785
  (q_normed, rms) = self.q_norm(q_down)
786
 
787
  if snark:
788
+ dirStr = f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}'
789
  os.makedirs(dirStr, exist_ok=True)
790
  saveTensor(f'{dirStr}/q_norm_x.bin', q_down.cpu())
791
  saveTensor(f'{dirStr}/q_norm_weight.bin', self.q_norm.weight.view(torch.uint32).cpu())
 
810
  # freqs_cis 的 rescale 为 2^42, 计算之后 q_pe 的 rescale 为 2^19
811
 
812
  if snark:
813
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/q_pe_x.bin', q_pe.cpu())
814
+ saveTensor(f'{zkDataDir}/freqs_cis.bin', freqs_cis.cpu())
815
 
816
  q_pe = apply_rotary_emb(q_pe, freqs_cis)
817
 
818
  if snark:
819
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/q_pe_y.bin', self.q_norm.weight.view(torch.uint32).cpu())
820
 
821
  # 获取key和value的联合表示kv(即公式41中的)和包含位置信息的key表示k_pe(即公式43中的):输入乘以向下投影矩阵wkv_a后,按照最后一个维度拆分,
822
  # 前面kv_lora_rank维作为key和value的联合表示,后面qk_rope_head_dim维添加rope位置信息(调用apply_rotary_emb)后得到包含rope位置信息的key表示;
 
825
  kv, kv_rem = self.wkv_a1(x)
826
 
827
  if snark:
828
+ dirStr = f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}'
829
  os.makedirs(dirStr, exist_ok=True)
830
  saveTensor(f'{dirStr}/wkv_a1_x.bin', x.cpu())
831
  saveTensor(f'{dirStr}/wkv_a1_w.bin', self.wkv_a1.weight.view(torch.uint32).cpu())
 
910
 
911
  # # softmax_q19 会破坏 scores 的原始数据,先拷贝一份数据
912
  if snark:
913
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_x.bin', scores.contiguous().cpu())
914
 
915
  softmax_q19(scores.contiguous(), scores_new)
916
 
917
  if snark:
918
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_y.bin', scores_new.cpu())
919
 
920
  if attn_impl == "naive":
921
  x = torch.einsum("bsht,bthd->bshd", scores, self.v_cache[:bsz, :end_pos])
 
1034
  # silu_q25(r1, s1)
1035
 
1036
  if snark:
1037
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_x.bin', r1.contiguous().cpu())
1038
 
1039
  silu_q23(r1, s1)
1040
 
1041
  if snark:
1042
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_y.bin', s1.cpu())
1043
 
1044
  # r2 rescale: 2^23, shape: [1, seqLen, inter_dim]
1045
  r2 = self.w3(x)
 
1129
  C = torch.empty_like(scores, dtype=torch.int64, device='cuda')
1130
 
1131
  if snark:
1132
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_x.bin', scores.cpu())
1133
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_r.bin', scores_rem.cpu())
1134
 
1135
  sigmoid_q23(scores, C)
1136
 
1137
  if snark:
1138
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_y.bin', C.cpu())
1139
 
1140
  # 当前 scores shape: [seqLen, 256]
1141
  scores = C.squeeze(0)
 
1148
  scores = scores + self.bias
1149
 
1150
  if snark:
1151
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_original_scores.bin', original_scores.contiguous().cpu())
1152
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_bias.bin', self.bias.view(torch.uint32).cpu())
1153
 
1154
  # n_groups = 8
1155
  if self.n_groups > 1:
 
1193
  weights = original_scores.gather(1, indices)
1194
 
1195
  if snark:
1196
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_indices.bin', indices.contiguous().cpu())
1197
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_weights.bin', weights.contiguous().cpu())
1198
 
1199
  # print(f'weights shape: {weights.shape}')
1200
  if self.score_func == "sigmoid":
 
1266
  # silu_q25(r1, s1)
1267
 
1268
  if snark:
1269
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_x.bin', r1.contiguous().cpu())
1270
 
1271
  silu_q23(r1, s1)
1272
 
1273
  if snark:
1274
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_y.bin', s1.cpu())
1275
 
1276
  # r2 rescale: 2^23
1277
  r2 = self.w3(x)
 
1451
  (atten_normed, rms) = self.attn_norm(x)
1452
 
1453
  if snark:
1454
+ os.makedirs(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}', exist_ok=True)
1455
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/attn_norm_x.bin', x.cpu())
1456
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/attn_norm_weight.bin', self.attn_norm.weight.view(torch.uint32).cpu())
1457
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/attn_norm_y.bin', atten_normed.cpu())
1458
+ saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/attn_norm_rms.bin', rms.cpu())
1459
 
1460
  # attned 的 rescale 是 2^19, shape: [1, seqLen, 7168]
1461
  attned = self.attn(atten_normed, start_pos, freqs_cis, mask)
inference/runLLM.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 MASTER_ADDR=127.0.0.1 python3 generate.py --ckpt-path /data3/DeepSeek-V3-Demo1 --config configs/config_671B.json --interactive --temperature 1.0 --max-new-tokens 200 > logs/log_$(date +%Y%m%d_%H%M%S).txt 2>&1
zk/babel.config.cjs ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ module.exports = {
2
+ presets: [['@babel/preset-env', { targets: { node: 'current' } }]],
3
+ };
zk/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "deployAliases": {}
4
+ }
zk/package.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "dp",
3
+ "version": "0.1.0",
4
+ "description": "",
5
+ "author": "",
6
+ "license": "Apache-2.0",
7
+ "keywords": [
8
+ "mina-zkapp",
9
+ "mina-zk-app",
10
+ "mina-dapp",
11
+ "zkapp"
12
+ ],
13
+ "type": "module",
14
+ "main": "build/src/index.js",
15
+ "types": "build/src/index.d.ts",
16
+ "scripts": {
17
+ "build": "tsc",
18
+ "buildw": "tsc --watch",
19
+ "coverage": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage",
20
+ "format": "prettier --write --ignore-unknown **/*",
21
+ "test": "npm run build && find build/src -name '*.test.js' -exec node --test {} \\;",
22
+ "testw": "npm run build && find build/src -name '*.test.js' -exec node --test --watch {} \\;",
23
+ "lint": "npx eslint src/* --fix",
24
+ "clear-cache": "npx rimraf cache/* !cache/README.md && npx rimraf cache.json && echo \"Cache cleared successfully!\"",
25
+ "start": "node build/src/run.js"
26
+ },
27
+ "devDependencies": {
28
+ "@babel/preset-env": "^7.16.4",
29
+ "@babel/preset-typescript": "^7.16.0",
30
+ "@types/node": "^22.14.1",
31
+ "@typescript-eslint/eslint-plugin": "^5.5.0",
32
+ "@typescript-eslint/parser": "^5.5.0",
33
+ "eslint": "^8.7.0",
34
+ "eslint-plugin-o1js": "^0.4.0",
35
+ "prettier": "^2.3.2",
36
+ "ts-jest": "^29.2.4",
37
+ "typescript": "^5.4.5",
38
+ "commander": "^14.0.2"
39
+ },
40
+ "peerDependencies": {
41
+ "o1js": "^2.*"
42
+ },
43
+ "engines": {
44
+ "node": ">=18.14.0"
45
+ }
46
+ }
zk/runZK.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import numpy as np
3
+
4
+ zkDataDir = '../zkdata'
5
+
6
+ async def main():
7
+ sem1 = asyncio.Semaphore(1)
8
+ sem7 = asyncio.Semaphore(7)
9
+ sem8 = asyncio.Semaphore(8)
10
+ sem32 = asyncio.Semaphore(32)
11
+
12
+ async def taskEmbed():
13
+ print(f'taskEmbed')
14
+
15
+ fEmbed = open("embed.log", "a", buffering=1)
16
+ fEmbedErr = open("embedErr.log", "w", buffering=1)
17
+
18
+ data = np.fromfile(f"{zkDataDir}/pos_0/tokens.bin", dtype=np.int64)
19
+ print('xs: ', data)
20
+ dataLen = len(data)
21
+
22
+ # 计算 所有 vocabulary embedding 的 hash
23
+ async def computeHash(tokenId):
24
+ async with sem32:
25
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js computeHash embed {tokenId}',
26
+ stdout=fEmbed, stderr=fEmbedErr)
27
+ rc = await p.wait()
28
+ return (tokenId, rc)
29
+
30
+ results = await asyncio.gather(*(computeHash(i) for i in range(0, 129280) ))
31
+
32
+ # 汇集所有的 vocabulary embedding 到 hashTable.json 中
33
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js precomputeHashes embed',
34
+ stdout=fEmbed, stderr=fEmbedErr)
35
+ rc = await p.wait()
36
+
37
+ # 计算 tokens 的 root hash
38
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js computeEmbedHash embed',
39
+ stdout=fEmbed, stderr=fEmbedErr)
40
+ rc = await p.wait()
41
+
42
+ async def taskEmbedBase(rowId):
43
+ async with sem7:
44
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js embedSectionBase embed {rowId}',
45
+ stdout=fEmbed, stderr=fEmbedErr)
46
+ rc = await p.wait()
47
+ return (rowId, rc)
48
+
49
+ results = await asyncio.gather(*(taskEmbedBase(i) for i in range(0, dataLen) ))
50
+
51
+ async def taskEmbedMerge(rowId):
52
+ async with sem7:
53
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js embedSectionMerge embed {rowId}',
54
+ stdout=fEmbed, stderr=fEmbedErr)
55
+ rc = await p.wait()
56
+ return (rowId, rc)
57
+
58
+ results = await asyncio.gather(*(taskEmbedMerge(i) for i in range(0, dataLen) ))
59
+
60
+ async def taskEmbedRowsMerge():
61
+ async with sem7:
62
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js embedRowsMerge embed',
63
+ stdout=fEmbed, stderr=fEmbedErr)
64
+ rc = await p.wait()
65
+ return rc
66
+
67
+ results = await asyncio.gather((taskEmbedRowsMerge() ))
68
+
69
+ fEmbed.close()
70
+ fEmbedErr.close()
71
+
72
+ async def taskAttnNorm(name, posId, layerId):
73
+ print(f'taskAttnNorm {name}')
74
+
75
+ fLog = open(f"{name}_Norm.log", "a", buffering=1)
76
+ fErr = open(f"{name}_NormErr.log", "w", buffering=1)
77
+
78
+ data = np.fromfile(f"{zkDataDir}/pos_0/tokens.bin", dtype=np.int64)
79
+ print('xs: ', data)
80
+ dataLen = len(data)
81
+
82
+ async def taskAttnNormBase(rowId, ind):
83
+ async with sem7:
84
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js normBase {name} {posId} {layerId} {rowId} {ind}',
85
+ stdout=fLog, stderr=fErr)
86
+ rc = await p.wait()
87
+ return (rowId, rc)
88
+
89
+ if name == 'attn_norm':
90
+ results = await asyncio.gather(*(taskAttnNormBase(i, j) for i in range(0, 24) for j in (0, 32)))
91
+ elif name == 'q_norm':
92
+ results = await asyncio.gather(*(taskAttnNormBase(i, 0) for i in range(0, 24)))
93
+
94
+ async def taskAttnNormMerge(rowId, startIdx):
95
+ async with sem8:
96
+ rc = 0
97
+ for j in range(startIdx, 0, -8):
98
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js normMerge {name} {posId} {layerId} {rowId} {j}',
99
+ stdout=fLog, stderr=fErr)
100
+ rc = await p.wait()
101
+ return (rowId, rc)
102
+
103
+ if name == 'attn_norm':
104
+ results = await asyncio.gather(*(taskAttnNormMerge(i, 62) for i in range(0, 24)))
105
+ elif name == 'q_norm':
106
+ results = await asyncio.gather(*(taskAttnNormMerge(i, 30) for i in range(0, 24)))
107
+
108
+ async def normWrapRow():
109
+ async with sem7:
110
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js normWrapRow {name} {posId} {layerId}',
111
+ stdout=fLog, stderr=fErr)
112
+ rc = await p.wait()
113
+ return rc
114
+
115
+ results = await asyncio.gather(normWrapRow())
116
+
117
+ async def normMergeRow():
118
+ async with sem7:
119
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js normMergeRow {name} {posId} {layerId}',
120
+ stdout=fLog, stderr=fErr)
121
+ rc = await p.wait()
122
+ return rc
123
+
124
+ results = await asyncio.gather(normMergeRow())
125
+
126
+ fLog.close()
127
+ fErr.close()
128
+
129
+
130
+ # gate 中 experts 选择逻辑
131
+ async def taskExpertSelector(name, posId, layerId):
132
+ print(f'taskGateExpertSelector')
133
+
134
+ fgate = open(f"{name}_expertSelector.log", "a", buffering=1)
135
+ fgateErr = open(f"{name}_expertSelectorErr.log", "w", buffering=1)
136
+
137
+ async def taskGroupBase(rowId):
138
+ async with sem8:
139
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js expertsGroupBase {name} {posId} {layerId} {rowId}',
140
+ stdout=fgate, stderr=fgateErr)
141
+ rc = await p.wait()
142
+ return (rowId, rc)
143
+
144
+ results = await asyncio.gather(*(taskGroupBase(i) for i in range(0, 24) ))
145
+
146
+ async def taskGroupMerge(rowId):
147
+ async with sem8:
148
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js expertsGroupMerge {name} {posId} {layerId} {rowId}',
149
+ stdout=fgate, stderr=fgateErr)
150
+ rc = await p.wait()
151
+ return (rowId, rc)
152
+
153
+ results = await asyncio.gather(*(taskGroupMerge(i) for i in range(0, 24) ))
154
+
155
+ async def taskSortedGroupBase(rowId):
156
+ async with sem8:
157
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js expertsSortedGroupBase {name} {posId} {layerId} {rowId}',
158
+ stdout=fgate, stderr=fgateErr)
159
+ rc = await p.wait()
160
+ return (rowId, rc)
161
+
162
+ results = await asyncio.gather(*(taskSortedGroupBase(i) for i in range(0, 24) ))
163
+
164
+ async def taskSortedGroupMerge(rowId):
165
+ async with sem8:
166
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js expertsSortedGroupMerge {name}s {posId} {layerId} {rowId}',
167
+ stdout=fgate, stderr=fgateErr)
168
+ rc = await p.wait()
169
+ return (rowId, rc)
170
+
171
+ results = await asyncio.gather(*(taskSortedGroupMerge(i) for i in range(0, 24) ))
172
+
173
+ async def taskSelectorBase(rowId):
174
+ async with sem8:
175
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js expertsSelectorBase {name} {posId} {layerId} {rowId}',
176
+ stdout=fgate, stderr=fgateErr)
177
+ rc = await p.wait()
178
+ return (rowId, rc)
179
+
180
+ results = await asyncio.gather(*(taskSelectorBase(i) for i in range(0, 24) ))
181
+
182
+ async def taskSelectorMerge():
183
+ async with sem8:
184
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js expertsSelectorMerge {name} {posId} {layerId}',
185
+ stdout=fgate, stderr=fgateErr)
186
+ rc = await p.wait()
187
+ return rc
188
+
189
+ results = await asyncio.gather((taskSelectorMerge() ))
190
+
191
+ fgate.close()
192
+ fgateErr.close()
193
+
194
+
195
+ async def taskRope_pe():
196
+ print(f'taskRope')
197
+
198
+ fLog = open("rope.log", "a", buffering=1)
199
+ fErr = open("ropeErr.log", "w", buffering=1)
200
+
201
+ data = np.fromfile(f"{zkDataDir}/pos_0/tokens.bin", dtype=np.int64)
202
+ print('xs: ', data)
203
+ dataLen = len(data)
204
+
205
+ async def ropeBase(name, posId, layerId, rowId, ind, f_out, f_err):
206
+ async with sem7:
207
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js ropeBase {name} {posId} {layerId} {rowId} {ind}',
208
+ stdout=f_out, stderr=f_err)
209
+ rc = await p.wait()
210
+ return rc
211
+
212
+ results = await asyncio.gather(*(ropeBase('q_pe', 0, 0, i, j, fLog, fErr) for i in range(0, 24) for j in (0, 32, 64, 96) ))
213
+
214
+ async def ropeMerge(name, posId, layerId, rowId, ind, f_out, f_err):
215
+ async with sem8:
216
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js ropeMerge {name} {posId} {layerId} {rowId} {ind}',
217
+ stdout=f_out, stderr=f_err)
218
+ rc = await p.wait()
219
+ return rc
220
+
221
+ for j in range(126, -1, -8):
222
+ results = await asyncio.gather(*(ropeMerge('q_pe', 0, 0, i, j, fLog, fErr) for i in range(0, 24) ))
223
+
224
+ async def wrapRopeRow(name, posId, layerId, f_out, f_err):
225
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js wrapRopeRow {name} {posId} {layerId}',
226
+ stdout=f_out, stderr=f_err)
227
+ rc = await p.wait()
228
+ return rc
229
+
230
+ results = await asyncio.gather(wrapRopeRow('q_pe', 0, 0, fLog, fErr))
231
+
232
+ async def mergeRopeRow(name, posId, layerId, f_out, f_err):
233
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js mergeRopeRow {name} {posId} {layerId}',
234
+ stdout=f_out, stderr=f_err)
235
+ rc = await p.wait()
236
+ return rc
237
+
238
+ results = await asyncio.gather(mergeRopeRow('q_pe', 0, 0, fLog, fErr))
239
+
240
+ fLog.close()
241
+ fErr.close()
242
+
243
+ async def taskSoftmax(name):
244
+ print(f'taskSoftmax {name}')
245
+
246
+ fLog = open(f"{name}_softmax.log", "a", buffering=1)
247
+ fErr = open(f"{name}_softmaxErr.log", "w", buffering=1)
248
+
249
+ async def softmaxHeadBase(posId, layerId, rowId, headId, headDim):
250
+ async with sem7:
251
+ p = await asyncio.create_subprocess_exec(
252
+ "bash", "-lc",
253
+ f'node build/src/index.js softmaxHeadBase {name} {posId} {layerId} {rowId} {headId} {headDim}',
254
+ stdout=fLog, stderr=fErr)
255
+ rc = await p.wait()
256
+ return rc
257
+ results = await asyncio.gather(*(softmaxHeadBase(0, 0, i, j, 24) for i in range(0, 24) for j in range(0, 128, 4)))
258
+
259
+ async def softmaxHeadMerge(posId, layerId, rowId, headDim):
260
+ async with sem8:
261
+ rc = 0
262
+ for ind in range(126, -1, -8):
263
+ p = await asyncio.create_subprocess_exec(
264
+ "bash", "-lc",
265
+ f'node build/src/index.js softmaxHeadMerge {name} {posId} {layerId} {rowId} {ind} {headDim}',
266
+ stdout=fLog, stderr=fErr)
267
+ rc = await p.wait()
268
+ return (rowId, rc)
269
+ results = await asyncio.gather(*(softmaxHeadMerge(0, 0, i, 24) for i in range(0, 24)))
270
+
271
+ async def softmaxWrapRow(posId, layerId):
272
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js softmaxWrapRow {name} {posId} {layerId}',
273
+ stdout=fLog, stderr=fErr)
274
+ rc = await p.wait()
275
+ return rc
276
+ results = await asyncio.gather(softmaxWrapRow(0, 0))
277
+
278
+ async def softmaxMergeRow(posId, layerId):
279
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js softmaxMergeRow {name} {posId} {layerId}',
280
+ stdout=fLog, stderr=fErr)
281
+ rc = await p.wait()
282
+ return rc
283
+ results = await asyncio.gather(softmaxMergeRow(0, 0))
284
+
285
+ fLog.close()
286
+ fErr.close()
287
+
288
+ async def taskSigmoid(name):
289
+ print(f'taskSigmoid {name}')
290
+
291
+ fLog = open(f"{name}_sigmoid.log", "a", buffering=1)
292
+ fErr = open(f"{name}_sigmoidErr.log", "w", buffering=1)
293
+
294
+ async def sigmoidSectionBase(posId, layerId, rowId):
295
+ async with sem8:
296
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js sigmoidSectionBase {name} {posId} {layerId} {rowId}',
297
+ stdout=fLog, stderr=fErr)
298
+ rc = await p.wait()
299
+ return rc
300
+
301
+ results = await asyncio.gather(*(sigmoidSectionBase(0, 3, i) for i in range(0, 24) ))
302
+
303
+ async def sigmoidSectionMerge(posId, layerId, rowId):
304
+ async with sem8:
305
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js sigmoidSectionMerge {name} {posId} {layerId} {rowId}',
306
+ stdout=fLog, stderr=fErr)
307
+ rc = await p.wait()
308
+ return rc
309
+
310
+ results = await asyncio.gather(*(sigmoidSectionMerge(0, 3, i) for i in range(0, 24) ))
311
+
312
+ async def sigmoidRowBase(posId, layerId):
313
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js sigmoidRowBase {name} {posId} {layerId}',
314
+ stdout=fLog, stderr=fErr)
315
+ rc = await p.wait()
316
+ return rc
317
+ results = await asyncio.gather(sigmoidRowBase(0, 3))
318
+
319
+ async def sigmoidRowMerge(posId, layerId):
320
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js sigmoidRowMerge {name} {posId} {layerId}',
321
+ stdout=fLog, stderr=fErr)
322
+ rc = await p.wait()
323
+ return rc
324
+ results = await asyncio.gather(sigmoidRowMerge(0, 3))
325
+
326
+ fLog.close()
327
+ fErr.close()
328
+
329
+ async def taskGemm(name, posId, layerId, InDim, OutDim, ShortDim):
330
+ print(f'taskGemm {name}')
331
+
332
+ fLog = open(f"{name}_gemm.log", "a", buffering=1)
333
+ fErr = open(f"{name}_gemmErr.log", "w", buffering=1)
334
+
335
+ data = np.fromfile(f"{zkDataDir}/pos_{posId}/tokens.bin", dtype=np.int64)
336
+ print('xs: ', data)
337
+ rowCount = len(data)
338
+
339
+ segmentCount = InDim // ShortDim
340
+ startIndArr = [i * 32 for i in range(0, segmentCount // 32)]
341
+ print('startIndArr: ', startIndArr)
342
+
343
+ async def gemmXBase(rowId, ind):
344
+ async with sem8:
345
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js gemmXBase {name} {posId} {layerId} {rowId} {ind}',
346
+ stdout=fLog, stderr=fErr)
347
+ rc = await p.wait()
348
+ return rc
349
+ results = await asyncio.gather(*(gemmXBase(i, j) for i in range(0, rowCount) for j in startIndArr))
350
+
351
+ async def gemmXMergeRow(ind):
352
+ async with sem8:
353
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js gemmXMergeRow {name} {posId} {layerId} {ind}',
354
+ stdout=fLog, stderr=fErr)
355
+ rc = await p.wait()
356
+ return rc
357
+ results = await asyncio.gather(*(gemmXMergeRow(j) for j in range(segmentCount - 1, 2 * segmentCount - 1)))
358
+
359
+ async def gemmWBase(rowId, ind):
360
+ async with sem8:
361
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js gemmWBase {name} {posId} {layerId} {rowId} {ind}',
362
+ stdout=fLog, stderr=fErr)
363
+ rc = await p.wait()
364
+ return rc
365
+ results = await asyncio.gather(*(gemmWBase(i, j) for i in range(0, OutDim) for j in startIndArr ))
366
+
367
+ async def gemmWMergeRow(ind):
368
+ async with sem8:
369
+ rc = 0
370
+ for rowIndex in range(1, 512, 32):
371
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js gemmWMergeRow {name} {posId} {layerId} {ind} {rowIndex}',
372
+ stdout=fLog, stderr=fErr)
373
+ rc = await p.wait()
374
+ return rc
375
+ results = await asyncio.gather(*(gemmWMergeRow(j) for j in range(segmentCount - 1, 2 * segmentCount - 1) ))
376
+
377
+ async def gemmXWBase(ind):
378
+ async with sem8:
379
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js gemmXWBase {name} {posId} {layerId} {ind}',
380
+ stdout=fLog, stderr=fErr)
381
+ rc = await p.wait()
382
+ return rc
383
+ results = await asyncio.gather(*(gemmXWBase(i) for i in startIndArr))
384
+
385
+ async def gemmXWMerge(ind):
386
+ async with sem1:
387
+ p = await asyncio.create_subprocess_exec("bash", "-lc", f'node build/src/index.js gemmXWMerge {name} {posId} {layerId} {ind}',
388
+ stdout=fLog, stderr=fErr)
389
+ rc = await p.wait()
390
+ return rc
391
+ results = await asyncio.gather(*(gemmXWMerge(i) for i in range(segmentCount - 2, -1, -8)))
392
+
393
+ fLog.close()
394
+ fErr.close()
395
+
396
+ # await taskExpertSelector_gate(0, 4)
397
+ await taskEmbed()
398
+ # await taskAttnNorm('attn_norm', 0, 0)
399
+ # await taskAttnNorm('q_norm')
400
+ # await taskRope_pe()
401
+ # await taskSoftmax('scores')
402
+ # await taskSigmoid('gate')
403
+ # await taskExpertSelector('gate', 0, 3)
404
+ # await taskGemm('wkv_a1', 0, 0, 7168, 512, 112)
405
+
406
+ print("all done.")
407
+
408
+ asyncio.run(main())
zk/src/.DS_Store ADDED
Binary file (6.15 kB). View file
 
zk/src/index.ts ADDED
The diff for this file is too large to render. See raw diff
 
zk/tsconfig.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "es2021", // goal: ship *the most modern syntax* that is supported by *all* browsers that support our Wasm
4
+ "module": "nodenext", // allow top-level await
5
+ "lib": ["dom", "esnext"],
6
+ "outDir": "./build",
7
+ "rootDir": ".",
8
+ "strict": true,
9
+ "strictPropertyInitialization": false, // to enable generic constructors, e.g. on CircuitValue
10
+ "skipLibCheck": true,
11
+ "forceConsistentCasingInFileNames": true,
12
+ "esModuleInterop": true,
13
+ "moduleResolution": "nodenext", // comply with node + "type": "module"
14
+ "experimentalDecorators": true, // needed for decorators used in o1js
15
+ "emitDecoratorMetadata": true, // needed for decorators used in o1js
16
+ "allowJs": true, // to use JSDoc in some places where TS would be too cumbersome
17
+ "declaration": true,
18
+ "sourceMap": true,
19
+ "noFallthroughCasesInSwitch": true,
20
+ "allowSyntheticDefaultImports": true,
21
+ "useDefineForClassFields": false, // ensure correct behaviour of class fields with decorators
22
+ "importHelpers": true, // bundle optimization to reduce size
23
+ "baseUrl": "." // base directory for module resolution
24
+ },
25
+ "include": ["./src"]
26
+ }