WangKaiLin commited on
Commit
f27828b
·
verified ·
1 Parent(s): 90af71a

Upload 5 files

Browse files
Files changed (5) hide show
  1. LICENSE +21 -0
  2. config.json +6 -0
  3. engine.py +512 -0
  4. quickstart.py +25 -0
  5. vocabulary.json +0 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 galaxy4552
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "pipeowl",
3
+ "architecture": "semantic-field-retrieval",
4
+ "embedding_dim": 1024,
5
+ "vocab_size": 495090
6
+ }
engine.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from __future__ import annotations

import json
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np  # type: ignore
from safetensors.numpy import load_file

# Absolute path of this module file; asset paths are derived from its
# parent directory (see PipeOwlConfig.ROOT_DIR).
BASE_DIR = Path(__file__).resolve()

# Semantic-field tensors, loaded eagerly as a module-level side effect.
# Resolved against the module directory (not the CWD) so importing this
# module works regardless of where the interpreter was started — the bare
# relative path previously broke any run launched from another directory.
data = load_file(str(BASE_DIR.parent / "pipeowl_fp16.safetensors"))
@dataclass
class PipeOwlConfig:
    """
    Global configuration for the PipeOwl engine.

    Asset layout (all index-aligned; index i is a fixed field point):
        embeddings  (V, D) -- base vectors of the semantic field
        delta       (V,)   -- per-token static field offset (score bias)
        vocab       (V,)   -- token list; i <-> emb[i] <-> delta[i]

    Scoring:
        score = alpha * base_similarity + beta * delta
    where delta currently acts as a static logit bias, not a dynamic loss.

    ``temperature`` and ``max_new_tokens`` are placeholders for the decode
    stage (the decoder is currently a stub).
    """
    # Directory that holds this module's on-disk assets.
    ROOT_DIR = BASE_DIR.parent
    safetensors_path: str = str(ROOT_DIR / "pipeowl.safetensors")
    vocab_path: str = str(ROOT_DIR / "vocabulary.json")

    # Performance toggles.
    normalize_rows: bool = True      # row-normalize embeddings so cosine == dot
    ensure_contiguous: bool = True   # keep emb C-contiguous for faster GEMV
    max_token_len_cap: int = 32      # cap tokenizer max-match length

    # Scoring weights: score = alpha * base + beta * delta_field.
    alpha: float = 1.0
    beta: float = 0.05               # fixed value; delta is a logit bias

    # Retrieval.
    top_k: int = 16

    # Decode (placeholders until a real decoder is plugged in).
    temperature: float = 0.8
    max_new_tokens: int = 64
## semanticizer
class VocabTokenizer:
    """
    Greedy longest-match tokenizer over a fixed vocabulary.

    Intended for char/word-level vocabularies that are index-aligned with
    the embedding matrix.

    Strategy (see ``tokenize``):
        1. lowercase the input;
        2. if the text contains ASCII letters, return the letter runs
           directly (alphabetic fast path, bypasses the vocab);
        3. otherwise greedily match the longest vocab entry at each
           position, skipping characters that match nothing.
    """

    def __init__(self, vocab_list, *, max_len_cap: Optional[int] = None):
        """
        vocab_list:
            sequence of token strings (order irrelevant here).
        max_len_cap:
            optional upper bound on the longest match length; guards
            against a pathologically long vocab entry slowing the scan.
        """
        self.vocab_set = set(vocab_list)

        # default=0 keeps an empty vocabulary from raising ValueError
        # (the original `max(...)` crashed on an empty sequence);
        # tokenize() then simply matches nothing.
        mx = max((len(t) for t in vocab_list), default=0)
        if max_len_cap is not None:
            mx = min(mx, int(max_len_cap))
        self.max_len = mx

    def tokenize(self, text):
        """Split *text* into known tokens (see class docstring)."""
        text = text.lower()

        # Alphabetic fast path: runs of ASCII letters are returned as-is.
        words = re.findall(r"[a-zA-Z]+", text)
        if words:
            return words

        # Greedy longest-match against the vocabulary.
        tokens = []
        i = 0
        n = len(text)
        while i < n:
            matched = False
            for L in range(self.max_len, 0, -1):
                if i + L <= n:
                    piece = text[i:i + L]
                    if piece in self.vocab_set:
                        tokens.append(piece)
                        i += L
                        matched = True
                        break
            if not matched:
                # No vocab entry starts here; drop this character.
                i += 1
        return tokens
class PipeOwlEngine:
    """
    PipeOwl geometric semantic engine.

    Design: the vocabulary index is the coordinate of the semantic field:
        emb[i]   -> token vector
        delta[i] -> static field offset of the token
        vocab[i] -> the token string itself

    Pipeline:
        text -> tokenize -> (weighted) mean embedding
             -> score = alpha * base + beta * delta
             -> top-k -> decode

    A field-based retrieval language system.
    """

    def __init__(self, cfg: PipeOwlConfig):
        """Pull the field tensors from the module-level ``data`` and build indexes."""
        self.cfg = cfg

        # (V, D) float32 embeddings and (V,) float32 offsets, index-aligned.
        self.emb = data["embeddings"].astype(np.float32)
        self.delta = data["delta_field"].astype(np.float32)
        self.token_to_id: Dict[str, int] = {}
        self.id_to_token: List[str] = []

        # Decoder (optional): inference-only stub; plug trained weights later.
        self.decoder = MicroGPTDecoder()

        self._load_assets()

    # -------------------------
    # asset loading
    # -------------------------

    def _load_assets(self) -> None:
        """
        Prepare the semantic-field assets.

        Ensures:
            1. embeddings (V, D) float32, optionally C-contiguous and
               row-normalized (so cosine similarity == dot product);
            2. delta (V,) float32;
            3. vocab list of length V, index-aligned with both.

        Raises:
            FileNotFoundError: vocab file missing.
            ValueError: shape / alignment violations.
        """
        if not os.path.exists(self.cfg.vocab_path):
            raise FileNotFoundError(self.cfg.vocab_path)

        emb = self.emb
        if emb.dtype != np.float32:
            emb = emb.astype(np.float32, copy=False)

        # C-contiguous layout speeds up the `emb @ q` GEMV.
        if self.cfg.ensure_contiguous and not emb.flags["C_CONTIGUOUS"]:
            emb = np.ascontiguousarray(emb)

        if self.cfg.normalize_rows:
            # A read-only memmap must become a real ndarray before we
            # rescale it.
            if isinstance(emb, np.memmap):
                emb = np.array(emb, copy=True)
            # Row-normalize once so every later dot product is a cosine.
            # (Previously documented but never actually performed.)
            norms = np.linalg.norm(emb, axis=1, keepdims=True)
            emb = emb / np.maximum(norms, 1e-12)

        # Write the prepared matrix back — the original computed all the
        # transforms above on a local and silently discarded them.
        self.emb = emb

        # delta: (V,). Already float32 from __init__; keep the guard in
        # case a caller replaced it.
        if self.delta.dtype != np.float32:
            self.delta = self.delta.astype(np.float32, copy=False)

        if self.emb.ndim != 2:
            raise ValueError(f"embeddings must be 2D (V, D), got shape={self.emb.shape}")

        # (V, D)
        V, _ = self.emb.shape

        if self.delta.ndim != 1 or self.delta.shape[0] != V:
            raise ValueError(f"delta must be shape (V,), got {self.delta.shape}, expected ({V},)")

        # vocab json: build token_to_id and id_to_token.
        with open(self.cfg.vocab_path, "r", encoding="utf-8-sig") as f:
            vocab_list = json.load(f)

        if not isinstance(vocab_list, list):
            raise ValueError("vocab must be a list for geometric field mode")

        if len(vocab_list) != V:
            raise ValueError(f"vocab size {len(vocab_list)} != embeddings V {V}")

        self.vocab = vocab_list
        self.id_to_token = vocab_list
        self.token_to_id = {t: i for i, t in enumerate(vocab_list)}

        # Honor the configured length cap (the config field was
        # previously declared but never used).
        self.tokenizer = VocabTokenizer(self.vocab, max_len_cap=self.cfg.max_token_len_cap)

    # -------------------------
    # encode (from vector library)
    # -------------------------

    def encode(self, text: str):
        """
        Project *text* into the semantic field.

        Steps:
            1. exact-token fast path: if *text* is itself a vocab token,
               return its (normalized) embedding directly — prevents
               "貓頭鷹 = mean(貓, 頭, 鷹)" pollution;
            2. otherwise tokenize, gather embeddings, length-weighted
               average (centroid in the field), normalize.

        Returns:
            (D,) float32 unit vector, or a zero vector when nothing in
            *text* maps to the vocabulary.

        Caveat: mean pooling weakens directionality.
        """
        idx0 = self.token_to_id.get(text)
        if idx0 is not None:
            v = self.emb[idx0].astype(np.float32, copy=False)
            # Rows are already unit-norm when cfg.normalize_rows is on;
            # renormalizing here is a cheap safety net.
            v = v / (np.linalg.norm(v) + 1e-12)
            return v

        tokens = self.tokenizer.tokenize(text)
        if not tokens:
            return np.zeros(self.emb.shape[1], dtype=np.float32)

        vecs = []
        wts = []
        for t in tokens:
            idx = self.token_to_id.get(t)
            # Out-of-vocabulary token (e.g. output of the alphabetic fast
            # path): skip rather than KeyError.
            if idx is None:
                continue

            vecs.append(self.emb[idx])
            # Length weight: a longer token carries more semantics.
            wts.append(max(1, len(t)))

        if not vecs:
            # Every token was out-of-vocabulary — previously np.stack([])
            # raised ValueError here.
            return np.zeros(self.emb.shape[1], dtype=np.float32)

        vecs = np.stack(vecs, axis=0).astype(np.float32, copy=False)
        wts = np.asarray(wts, dtype=np.float32)
        q = np.average(vecs, axis=0, weights=wts)
        q /= (np.linalg.norm(q) + 1e-12)
        return q

    # -------------------------
    # loss / scoring (delta)
    # -------------------------
    def score_vocab(self, q: np.ndarray, alpha: Optional[float] = None, beta: Optional[float] = None) -> np.ndarray:
        """
        Score every vocab token against query vector *q*.

        base  = emb @ q (cosine similarity when both are normalized)
        score = alpha * base + beta * delta

        delta is a static logit bias — not a loss, not an energy
        gradient. *alpha*/*beta* default to the config values.
        """
        a = self.cfg.alpha if alpha is None else float(alpha)
        b = self.cfg.beta if beta is None else float(beta)

        base = self.emb @ q  # (V,)
        score = a * base + b * self.delta
        return score.astype(np.float32, copy=False)

    def topk(self, score: np.ndarray, k: Optional[int] = None) -> List[Tuple[str, float]]:
        """
        Return the k highest-scoring tokens as [(token, score), ...],
        best first.

        Uses argpartition (O(V)) then sorts only the k survivors.
        Note: scores may exceed 1 because of the delta bias.
        """
        k = self.cfg.top_k if k is None else int(k)
        k = max(1, min(k, score.shape[0]))

        # argpartition for speed
        idx = np.argpartition(-score, k - 1)[:k]
        idx = idx[np.argsort(-score[idx])]

        out = []
        for i in idx:
            tok = self.id_to_token[i] if i < len(self.id_to_token) else str(i)
            out.append((tok, float(score[i])))
        return out

    # -------------------------
    # decode (microgpt inference-only)
    # -------------------------
    def decode(self, prompt_tokens: List[str]) -> str:
        """
        Decode stage: join the retrieved tokens into a prompt string and
        hand it to the decoder.

        Retrieval and generation are deliberately separated; the decoder
        is currently a stub without real weights, so this is a pipeline
        placeholder.
        """
        prompt = " ".join([t for t in prompt_tokens if t])
        return self.decoder.generate(
            prompt=prompt,
            temperature=self.cfg.temperature,
            max_new_tokens=self.cfg.max_new_tokens,
        )

    # -------------------------
    # one-shot pipeline
    # -------------------------
    def pipeowl(
        self,
        text: str,
        *,
        top_k: Optional[int] = None,
        alpha: Optional[float] = None,
        beta: Optional[float] = None,
        temperature: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
    ) -> Dict[str, object]:
        """
        One full observation of the semantic field:
        encode -> score_vocab -> topk -> decode.

        Returns:
            {"query": original text,
             "retrieved": top-k (token, score) pairs,
             "prompt": token string fed to decode,
             "decoded": generation result}
        """
        q = self.encode(text)
        s = self.score_vocab(q, alpha=alpha, beta=beta)
        retrieved = self.topk(s, k=top_k)

        # build a prompt from top tokens (simple & deterministic)
        prompt_tokens = [t for (t, _) in retrieved[: min(len(retrieved), 8)]]
        # NOTE(review): these per-call overrides persist on the shared
        # config object — kept for backward compatibility, but callers
        # should be aware the override sticks for subsequent calls.
        if temperature is not None:
            self.cfg.temperature = float(temperature)
        if max_new_tokens is not None:
            self.cfg.max_new_tokens = int(max_new_tokens)

        decoded = self.decode(prompt_tokens)
        return {
            "query": text,
            "retrieved": retrieved,
            "prompt": " ".join(prompt_tokens),
            "decoded": decoded,
        }
# ----------------------------------------------------------------------
# microgpt inference-only stub
# ----------------------------------------------------------------------
class MicroGPTDecoder:
    """
    Inference-only placeholder decoder.

    Purpose: keep the pipeline runnable today while exposing a stable
    interface for a real generator later — a trained microGPT, an
    external LLM, or a field-driven sampling model.

    Why a placeholder? A real decode stage would either
        (A) load a trained state_dict from disk,
        (B) keep a small trained model in memory, or
        (C) use microGPT purely as a sampler over a learned char vocab.
    None of those exist yet; this class only echoes the prompt. Plug a
    real implementation in behind the same interface when weights exist.
    """

    def __init__(self):
        # When trained weights exist, load them here, e.g.:
        #   self.state_dict = load(...)
        pass

    def generate(self, prompt: str, temperature: float = 0.8, max_new_tokens: int = 64) -> str:
        """Echo *prompt* behind a stub tag.

        *temperature* and *max_new_tokens* are accepted for interface
        compatibility but unused until a real model is plugged in.
        """
        return f"[microgpt_stub] {prompt}"
quickstart.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from engine import PipeOwlEngine, PipeOwlConfig
import time


def main() -> None:
    """Run a short latency benchmark, then an interactive retrieval loop."""
    engine = PipeOwlEngine(PipeOwlConfig())

    # latency test code 33~34ms
    for _ in range(20):
        t0 = time.perf_counter()
        engine.pipeowl("雪鴞")
        print((time.perf_counter() - t0) * 1000, "ms")

    # Interactive loop; Ctrl-C / Ctrl-D now exits cleanly instead of
    # dumping a KeyboardInterrupt/EOFError traceback.
    while True:
        try:
            query = input("Please enter words: ")
        except (EOFError, KeyboardInterrupt):
            break

        out = engine.pipeowl(query, top_k=5)

        print("\nTop-K Tokens:")
        for text, score in out["retrieved"]:
            print(f"{score:.3f} | {text}")

        print("\nDecoded:")
        print(out["decoded"])
        print()


# Guarding the entry point keeps `import quickstart` side-effect free.
if __name__ == "__main__":
    main()
vocabulary.json ADDED
The diff for this file is too large to render. See raw diff