1.0
Browse files- LICENSE +21 -0
- data/L1_base_embeddings.npy +3 -0
- data/L1_base_vocab.json +0 -0
- data/delta_base_scalar.npy +3 -0
- engine.py +433 -0
- model_card.md +15 -0
- quickstart.py +16 -0
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 galaxy4552
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
data/L1_base_embeddings.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e6833b0cc4f6ddf1b24824cf4180b23ed4573a5beeabf9ef749dbf834e13036
|
| 3 |
+
size 660230272
|
data/L1_base_vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/delta_base_scalar.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:483a76c516adb27ad80368903d4b44a3d7d402abfb4065bd51d2c3322d652a58
|
| 3 |
+
size 644884
|
engine.py
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Dict, List, Tuple, Optional
|
| 7 |
+
import numpy as np # type: ignore
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Resolve asset paths relative to this source file so the engine works
# regardless of the process's current working directory.
BASE_DIR = Path(__file__).resolve().parent  # directory that contains engine.py
DATA_DIR = BASE_DIR / "data"                # semantic-field assets live here
|
| 12 |
+
|
| 13 |
+
## I only did the reasoning and the abstraction into a better architecture; the code itself was written by AI.
|
| 14 |
+
|
| 15 |
+
@dataclass
class PipeOwlConfig:
    """
    Global configuration for the PipeOwl engine.

    embeddings_path:
        Base embedding matrix of the semantic field, shape (V, D).
        V = vocabulary size, D = embedding dimension.

    delta_scalar_path:
        Per-token one-dimensional field offset, shape (V,).
        Used as a score offset (currently a static bias).

    vocab_path:
        Vocab list; must be index-aligned with the embeddings:
        index i <-> emb[i] <-> delta[i]

    alpha:
        Weight of the base similarity term.

    beta:
        Weight of the delta term (currently a logit bias, not a dynamic loss).

    top_k:
        Default number of results returned by retrieval.

    temperature:
        Sampling temperature for the decode stage.

    max_new_tokens:
        Maximum generation length for decode.
    """
    embeddings_path: str = str(DATA_DIR / "L1_base_embeddings.npy")
    delta_scalar_path: str = str(DATA_DIR / "delta_base_scalar.npy")
    vocab_path: str = str(DATA_DIR / "L1_base_vocab.json")

    # scoring: score = alpha * base_sim + beta * delta_scalar
    alpha: float = 1.0
    beta: float = 1.0

    # retrieval
    top_k: int = 16

    # decode
    temperature: float = 0.8
    max_new_tokens: int = 64
|
| 62 |
+
|
| 63 |
+
## semanticizer
|
| 64 |
+
class VocabTokenizer:
    """
    Greedy longest-match tokenizer over a fixed vocabulary.

    Design goal:
        Split input text into tokens that exist in the vocab.

    Method:
        - Longest-match-first at each position.
        - OOV characters are skipped outright.

    Risks:
        - Skipped OOV characters may drop semantic content.
        - No subword fallback.
        - Time complexity O(n * max_token_len).

    Intended for character/word level vocabs already index-aligned with
    the embedding matrix.
    """

    def __init__(self, vocab_list: List[str]) -> None:
        self.vocab_set = set(vocab_list)
        # default=0 keeps an empty vocab from raising ValueError; tokenize()
        # then simply skips every character and returns [].
        self.max_len = max((len(t) for t in vocab_list), default=0)

    def tokenize(self, text: str) -> List[str]:
        """Return the vocab tokens found in *text*; OOV characters are dropped."""
        tokens: List[str] = []
        i = 0
        n = len(text)

        while i < n:
            matched = False
            # Longest-match-first; cap the window so it never runs past the end.
            for L in range(min(self.max_len, n - i), 0, -1):
                piece = text[i:i + L]
                if piece in self.vocab_set:
                    tokens.append(piece)
                    i += L
                    matched = True
                    break
            if not matched:
                i += 1  # skip OOV character
        return tokens
|
| 105 |
+
|
| 106 |
+
class PipeOwlEngine:
    """
    Core of the PipeOwl geometric semantic engine.

    Design philosophy:
        index = semantic-field coordinate

        emb[i]   -> token embedding
        delta[i] -> the token's field offset
        vocab[i] -> the token string itself

    Core pipeline:
        text
          -> tokenize
          -> mean embedding
          -> score = alpha*base + beta*delta
          -> top-k
          -> decode

    This is a field-based retrieval language system.
    """

    def __init__(self, cfg: PipeOwlConfig) -> None:
        self.cfg = cfg

        # Loaded assets (populated by _load_assets)
        self.emb: np.ndarray = None    # (V, D) float32
        self.delta: np.ndarray = None  # (V,)  float32
        self.token_to_id: Dict[str, int] = {}
        self.id_to_token: List[str] = []

        # Decoder (optional): inference-only stub; plug trained weights later.
        self.decoder = MicroGPTDecoder()

        self._load_assets()

    # -------------------------
    # asset loading
    # -------------------------

    def _load_assets(self) -> None:
        """
        Load the semantic-field assets.

        Loads:
            1. embeddings (V, D)
            2. delta scalar (V,)
            3. vocab list (V,)

        Key assumption:
            all three must be fully index-aligned.

        Geometric meaning:
            each index i corresponds to one fixed field point in
            semantic space.

        Raises:
            FileNotFoundError: if any asset file is missing.
            ValueError: if shapes/lengths are inconsistent or the vocab
                is not a list.
        """
        if not os.path.exists(self.cfg.embeddings_path):
            raise FileNotFoundError(self.cfg.embeddings_path)
        if not os.path.exists(self.cfg.delta_scalar_path):
            raise FileNotFoundError(self.cfg.delta_scalar_path)
        if not os.path.exists(self.cfg.vocab_path):
            raise FileNotFoundError(self.cfg.vocab_path)

        # embeddings: (V, D)
        self.emb = np.load(self.cfg.embeddings_path)
        if self.emb.dtype != np.float32:
            self.emb = self.emb.astype(np.float32, copy=False)

        # delta: (V,)
        self.delta = np.load(self.cfg.delta_scalar_path)
        if self.delta.dtype != np.float32:
            self.delta = self.delta.astype(np.float32, copy=False)

        if self.emb.ndim != 2:
            raise ValueError(f"embeddings must be 2D (V, D), got shape={self.emb.shape}")
        V, D = self.emb.shape

        if self.delta.ndim != 1 or self.delta.shape[0] != V:
            raise ValueError(f"delta must be shape (V,), got {self.delta.shape}, expected ({V},)")

        # vocab json: build token_to_id and id_to_token
        with open(self.cfg.vocab_path, "r", encoding="utf-8") as f:
            vocab_list = json.load(f)

        if not isinstance(vocab_list, list):
            raise ValueError("vocab must be a list for geometric field mode")

        if len(vocab_list) != V:
            raise ValueError(f"vocab size {len(vocab_list)} != embeddings V {V}")

        self.vocab = vocab_list
        self.id_to_token = vocab_list
        self.token_to_id = {ch: i for i, ch in enumerate(vocab_list)}

        self.tokenizer = VocabTokenizer(self.vocab)

    # -------------------------
    # encode (from vector library)
    # -------------------------

    def encode(self, text: str) -> np.ndarray:
        """
        Project text into the semantic field.

        Steps:
            1. tokenize -> token list
            2. look up each token's embedding
            3. mean pooling
            4. normalize

        Mathematical form:
            q = normalize( mean( emb[token_i] ) )

        Geometric meaning:
            compute the centroid in the semantic field.

        Risks:
            - mean pooling weakens directionality
            - few tokens / many OOV -> a zero vector is returned
        """
        tokens = self.tokenizer.tokenize(text)

        vecs = []
        for t in tokens:
            idx = self.token_to_id[t]
            vecs.append(self.emb[idx])

        if not vecs:
            return np.zeros(self.emb.shape[1], dtype=np.float32)

        q = np.mean(vecs, axis=0)
        q /= (np.linalg.norm(q) + 1e-12)  # epsilon guards against division by zero
        return q

    # -------------------------
    # loss / scoring (delta)
    # -------------------------
    def score_vocab(self, q: np.ndarray, alpha: Optional[float] = None, beta: Optional[float] = None) -> np.ndarray:
        """
        Compute the field score of every vocab token.

        base:
            emb @ q
            If emb and q are normalized, this is cosine similarity.

        delta:
            each token's static field offset.

        Final formula:
            score = alpha * base + beta * delta

        Current semantics:
            delta is a logit bias — not a loss, not an energy gradient.

        With beta = 0:
            pure embedding similarity search.
        With beta > 0:
            adds a field "gravity well" effect.
        """
        a = self.cfg.alpha if alpha is None else float(alpha)
        b = self.cfg.beta if beta is None else float(beta)

        base = self.emb @ q  # (V,)
        score = a * base + b * self.delta
        return score.astype(np.float32, copy=False)

    def topk(self, score: np.ndarray, k: Optional[int] = None) -> List[Tuple[str, float]]:
        """
        Return the k highest-scoring tokens.

        Uses argpartition for efficiency.

        Returns:
            [(token_string, score), ...] sorted by descending score.

        Geometric meaning:
            find the field points closest to the query vector
            (including the field offset).

        Note:
            scores may exceed 1 because delta is added.
        """
        k = self.cfg.top_k if k is None else int(k)
        k = max(1, min(k, score.shape[0]))  # clamp to a valid range

        # argpartition for speed; sort only the k selected entries
        idx = np.argpartition(-score, k - 1)[:k]
        idx = idx[np.argsort(-score[idx])]

        out = []
        for i in idx:
            # fall back to the raw index if the token table is shorter
            tok = self.id_to_token[i] if i < len(self.id_to_token) else str(i)
            out.append((tok, float(score[i])))
        return out

    # -------------------------
    # decode (microgpt inference-only)
    # -------------------------
    def decode(self, prompt_tokens: List[str]) -> str:
        """
        Decode stage.

        Current behavior:
            join the top tokens into a prompt string and hand it to the
            microgpt stub.

        Design intent:
            retrieval and generation are decoupled.

        Status:
            microgpt has no real weights yet; this is a pipeline
            placeholder.
        """
        prompt = " ".join([t for t in prompt_tokens if t])
        return self.decoder.generate(
            prompt=prompt,
            temperature=self.cfg.temperature,
            max_new_tokens=self.cfg.max_new_tokens,
        )

    # -------------------------
    # one-shot pipeline
    # -------------------------
    def pipeowl(
        self,
        text: str,
        *,
        top_k: Optional[int] = None,
        alpha: Optional[float] = None,
        beta: Optional[float] = None,
        temperature: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
    ) -> Dict[str, object]:
        """
        One complete pipeline pass.

        Flow:
            text -> encode -> score_vocab -> topk -> decode

        Returns:
            {
                "query":     original text,
                "retrieved": top-k tokens with scores,
                "prompt":    token string used for decode,
                "decoded":   generation result,
            }

        This is one full observation of a semantic-field query.
        The per-call temperature / max_new_tokens overrides apply to this
        call only; the shared config is left untouched.
        """
        q = self.encode(text)
        s = self.score_vocab(q, alpha=alpha, beta=beta)
        retrieved = self.topk(s, k=top_k)

        # build a prompt from top tokens (simple & deterministic)
        prompt_tokens = [t for (t, _) in retrieved[: min(len(retrieved), 8)]]

        # BUGFIX: the original assigned the overrides directly into self.cfg,
        # permanently changing the engine's defaults for every later call.
        # Apply them temporarily and always restore.
        saved_temperature = self.cfg.temperature
        saved_max_new_tokens = self.cfg.max_new_tokens
        try:
            if temperature is not None:
                self.cfg.temperature = float(temperature)
            if max_new_tokens is not None:
                self.cfg.max_new_tokens = int(max_new_tokens)
            decoded = self.decode(prompt_tokens)
        finally:
            self.cfg.temperature = saved_temperature
            self.cfg.max_new_tokens = saved_max_new_tokens

        return {
            "query": text,
            "retrieved": retrieved,
            "prompt": " ".join(prompt_tokens),
            "decoded": decoded,
        }
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
# ----------------------------------------------------------------------
|
| 397 |
+
# microgpt inference-only stub
|
| 398 |
+
# ----------------------------------------------------------------------
|
| 399 |
+
class MicroGPTDecoder:
    """
    Inference-only placeholder decoder.

    Purpose: keep the pipeline runnable end-to-end today. Swap this class
    later for:
        - a trained microGPT,
        - an external LLM,
        - or a field-driven sampling model.

    Why a placeholder?
        - The pasted microGPT file trains its own weights in-process.
        - A real decode stage would instead:
            (A) load a trained state_dict from disk, or
            (B) keep a small trained model in memory, or
            (C) use microGPT purely as a sampler over a learned char vocab.

    This class is the stable interface; plug an implementation in later.
    """

    def __init__(self) -> None:
        # Once trained weights exist, load them here, e.g.:
        #   self.state_dict = load(...)
        pass

    def generate(self, prompt: str, temperature: float = 0.8, max_new_tokens: int = 64) -> str:
        # Minimal safe fallback: echo the prompt as the "decoded" scaffold
        # so the pipeline stays callable today. Replace with a real
        # forward + sampling pass once weights are available.
        return "[microgpt_stub] " + prompt
|
model_card.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Model Type
|
| 2 |
+
Geometric embedding field (non-neural, transformer-free)
|
| 3 |
+
|
| 4 |
+
## Parameter Size
|
| 5 |
+
~165M embedding parameters (static matrix)
|
| 6 |
+
|
| 7 |
+
## Intended Use
|
| 8 |
+
- Semantic similarity
|
| 9 |
+
- Lightweight retrieval
|
| 10 |
+
- Geometric experimentation
|
| 11 |
+
|
| 12 |
+
## Limitations
|
| 13 |
+
- No contextual modeling
|
| 14 |
+
- No token interaction modeling
|
| 15 |
+
- Domain performance varies
|
quickstart.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BUGFIX: the original `from .engine import ...` is a relative import, which
# fails when this file is executed directly as a script
# ("attempted relative import with no known parent package").
from engine import PipeOwlEngine, PipeOwlConfig


def main() -> None:
    """Interactive demo loop: query the engine, print top-k tokens and decode."""
    engine = PipeOwlEngine(PipeOwlConfig())

    while True:
        # Exit cleanly on Ctrl-C / Ctrl-D instead of a traceback.
        try:
            query = input("請輸入句子: ")
        except (EOFError, KeyboardInterrupt):
            break

        out = engine.pipeowl(query, top_k=5)

        print("\nTop-K Tokens:")
        for text, score in out["retrieved"]:
            print(f"{score:.3f} | {text}")

        print("\nDecoded:")
        print(out["decoded"])
        print()


if __name__ == "__main__":
    main()
|