Files changed (10) hide show
  1. LICENSE +21 -0
  2. README.md +90 -0
  3. ai_score.py +199 -0
  4. config.json +12 -0
  5. engine.py +601 -0
  6. example.md +63 -0
  7. pipeowl.safetensors +3 -0
  8. ptt.npy +3 -0
  9. quickstart.py +38 -0
  10. tokenizer.json +0 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 galaxy4552
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,93 @@
1
  ---
 
 
 
 
 
 
 
 
 
2
  license: mit
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - zh
4
+ - en
5
+ tags:
6
+ - embeddings
7
+ - retrieval
8
+ - transformer-free
9
+ - safetensors
10
+ - edge-ai
11
  license: mit
12
  ---
13
+
14
+ # CleanOwl-0.1
15
+
16
+ **I hate AI-SLOP SO I MADE THIS.**
17
+
18
+ CleanOwl is a lightweight human-likeness scoring engine.
19
+
20
+ It detects whether a sentence feels like a natural human message or AI-generated content, using:
21
+
22
+ - token distribution irregularity
23
+ - semantic continuity
24
+ - punctuation behavior
25
+
26
+ No transformer. No fine-tuning. Pure statistical signals.
27
+
28
+ ## Score Interpretation
29
+
30
+ | Score | Meaning |
31
+ |------|--------|
32
+ | < 60 | Likely AI-generated / formal text |
33
+ | 60–75 | Mixed / ambiguous |
34
+ | ≥ 75 | Likely human-like message |
35
+
36
+ Note: This is not a classifier, but a heuristic scoring system.
37
+
38
+ ## Limitations
39
+
40
+ - Short sentences may be misclassified
41
+ - Highly polished human writing (e.g. essays) may look like AI
42
+ - AI can sometimes mimic human irregularity
43
+
44
+ This is a lightweight detector, not a definitive AI classifier.
45
+
46
+ ## Quickstart
47
+
48
+ ```bash
49
+ git clone https://huggingface.co/WangKaiLin/CleanOwl-0.1
50
+ cd CleanOwl-0.1
51
+
52
+ pip install numpy safetensors
53
+
54
+ python ai_score.py
55
+
56
+ # or embedding entry
57
+ python quickstart.py
58
+ ```
59
+
60
+ ## Example:
61
+
62
+
63
+ ```bash
64
+ 請輸入文字:先思考:在 AI 時代,什麼樣的人才不會被取代?我的答案是:具備溝通能力的人、擁有韌性的人,以及始終願意站在第一線的人。
65
+
66
+ human score: 47.13
67
+ label: ai_slop_like
68
+
69
+ 請輸入文字:身為專業的肥宅 都會把脂肪放在身上
70
+
71
+ human score: 76.88
72
+ label: human_like
73
+ ```
74
+
75
+ ## Repository Structure
76
+
77
+ ```bash
78
+ CleanOwl-0.1/
79
+ ├─ ai_score.py # human score / ai slop score
80
+ ├─ quickstart.py # demo CLI
81
+ ├─ engine.py # PipeOwl tokenizer + emb loader
82
+ ├─ pipeowl.safetensors # embeddings + delta_field
83
+ ├─ tokenizer.json
84
+ ├─ ptt.npy # style field
85
+ ├─ config.json
86
+ ├─ README.md
87
+ ├─ example.md
88
+ └─ LICENSE
89
+ ```
90
+
91
+ ## LICENSE
92
+
93
+ MIT
ai_score.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ai_score.py
2
+
3
+ from pathlib import Path
4
+ import numpy as np
5
+
6
+ from engine import PipeOwlEngine, PipeOwlConfig
7
+
8
# Directory containing this script; the style field file is resolved next to it.
BASE_DIR = Path(__file__).resolve().parents[0]
FIELD_PATH = BASE_DIR.joinpath("ptt.npy")

# Character sets used by the scorer. Each is a set of single characters.
# NOTE(review): several characters appear twice inside these literals. That is
# harmless for a set, but presumably one of each pair was meant to be the
# full-width form -- confirm against the original file.

# Characters treated as punctuation when deciding whether a token is scorable.
PUNCT = set(",。!?、;:,.!?;:()()[]【】「」『』《》〈〉\"'`~…—-_ ")

# Common single-character function words that are excluded from scoring.
STOP = set("的一是在有和人不我他你它這那就都也很到說要會可以的了嗎啊吧啦喔")

# Punctuation counted for the reported `punct_ratio`.
PUNCT_STRONG = set(",。;:「」『』()()、,.!?!?:;")

# Formal-looking punctuation (variant that still includes "." and "!").
PUNCT_FORMAL = set(",。;:「」『』()()、,.!:;")

# Characters counted as casual/chatty markers. Besides punctuation this set
# contains letters and characters such as w, W, X, D, x, d, 哈 and ㄏ, so any
# occurrence of those characters in the text is counted as "casual".
CASUAL_PUNCT = set("!?!?~~=wW哈ㄏXDxd.")

# Formal punctuation used for the score penalty.
FORMAL_PUNCT = set(",。;:「」『』()()、,:;")
17
+
18
def is_valid_style_token(tok: str) -> bool:
    """Return True when *tok* should contribute to the style score.

    A token is rejected when, after stripping surrounding whitespace, it is
    empty, consists only of characters from ``PUNCT``, or is a single
    character listed in ``STOP``.
    """
    stripped = tok.strip()
    if not stripped:
        return False

    # Pure punctuation carries no style signal.
    only_punct = set(stripped).issubset(PUNCT)
    # Single-character function words are too common to be informative.
    lone_stopword = len(stripped) == 1 and stripped in STOP

    return not (only_punct or lone_stopword)
32
+
33
class PTTScorer:
    """Heuristic "human-likeness" scorer.

    Combines three kinds of signal:
      * per-token values read from the style field stored in ``ptt.npy``,
      * the mean dot product between embeddings of consecutive known tokens
        ("continuity"),
      * character-level punctuation statistics.
    """

    def __init__(self):
        self.engine = PipeOwlEngine(PipeOwlConfig())
        # Indexed by the engine's token ids in score(); each entry is read as
        # a single float.
        self.field = np.load(FIELD_PATH).astype(np.float32)

    def score(self, text: str) -> dict:
        """Score *text* and return a dict of the score and its components.

        The result always contains the keys ``score``, ``raw``, ``mean``,
        ``var``, ``peak``, ``len_var``, ``continuity``, ``punct_ratio``,
        ``formal_punct_ratio``, ``label`` and ``tokens``.

        When no token is scorable (empty input, only punctuation/stop words,
        or nothing found in the vocabulary) the label is ``"unknown"``, the
        token list is empty and the field-derived numbers are ``0.0``.
        Otherwise ``tokens`` is a list of ``(token, field_value, flag)``
        tuples where ``flag`` is ``"used"`` or ``"ignored"``.
        """
        tokens = self.engine.tokenizer.tokenize(text)

        # ---- character-level punctuation statistics (whitespace excluded)
        chars = [ch for ch in text if not ch.isspace()]
        n_chars = max(1, len(chars))  # avoid division by zero on empty text

        punct_ratio = sum(ch in PUNCT_STRONG for ch in chars) / n_chars

        casual_punct_count = sum(ch in CASUAL_PUNCT for ch in chars)
        formal_punct_count = sum(ch in FORMAL_PUNCT for ch in chars)
        casual_punct_ratio = casual_punct_count / n_chars
        formal_punct_ratio = formal_punct_count / n_chars

        formal_types = {ch for ch in chars if ch in FORMAL_PUNCT}
        casual_types = {ch for ch in chars if ch in CASUAL_PUNCT}

        # NOTE(review): as written each bracket character is counted twice;
        # presumably one pair was meant to be the full-width brackets --
        # confirm against the original file. Kept as-is so scores do not move.
        paren_count = text.count("(") + text.count(")") + text.count("(") + text.count(")")
        quote_count = text.count("「") + text.count("」") + text.count('"') + text.count("'")

        # ---- token-level signals
        vals = []   # field values of scorable tokens
        used = []   # (token, field value, "used" | "ignored") for reporting
        vecs = []   # embeddings of every in-vocabulary token, in order

        for tok in tokens:
            idx = self.engine.token_to_id.get(tok)
            if idx is None:
                continue  # out-of-vocabulary fallback characters are skipped

            # Ignored tokens still take part in the continuity measure.
            vecs.append(self.engine.emb[idx])
            val = float(self.field[idx])

            if is_valid_style_token(tok):
                vals.append(val)
                used.append((tok, val, "used"))
            else:
                used.append((tok, val, "ignored"))

        # Mean dot product between neighbouring token embeddings.
        sims = [float(np.dot(a, b)) for a, b in zip(vecs, vecs[1:])]
        continuity = float(np.mean(sims)) if sims else 0.0

        if not vals:
            # Return the same keys as the normal path so callers that print
            # every metric do not fail with KeyError.
            return {
                "score": 0.0,
                "raw": 0.0,
                "mean": 0.0,
                "var": 0.0,
                "peak": 0.0,
                "len_var": 0.0,
                "continuity": round(continuity, 4),
                "punct_ratio": round(punct_ratio, 4),
                "formal_punct_ratio": round(formal_punct_ratio, 4),
                "label": "unknown",
                "tokens": [],
            }

        # Field statistics: the mean says how much the text resembles the
        # style field overall.
        vals = np.array(vals, dtype=np.float32)

        mean = float(np.mean(vals))
        var = float(np.var(vals))
        peak = float(np.max(vals) - mean)

        lengths = np.array(
            [len(tok) for tok, _val, flag in used if flag != "ignored"],
            dtype=np.float32,
        )
        len_var = float(np.var(lengths)) if len(lengths) > 0 else 0.0

        raw_score = (
            mean
            + 0.30 * var
            + 0.20 * peak
            + 0.10 * len_var
            - 4.0 * continuity
        )

        clean_structure = 1.0 if (len_var > 3.0 and var > 2.0 and continuity > 0.12) else 0.0
        raw_score -= 1.2 * clean_structure

        # Punctuation / formatting penalties.
        raw_score -= 10.0 * formal_punct_ratio
        raw_score -= 0.25 * paren_count
        raw_score -= 0.20 * quote_count

        # Casual punctuation is not penalised; it earns a small capped bonus.
        if casual_punct_ratio > 0:
            raw_score += min(0.8, casual_punct_ratio * 3.0)

        # A single kind of casual mark repeated is treated as human chatter.
        if len(casual_types) == 1 and casual_punct_count >= 2:
            raw_score += 0.7

        # Many different formal marks look like an article / AI text.
        if len(formal_types) >= 3:
            raw_score -= 0.8

        # At most two formal marks drawn only from this small set: soften the
        # penalty.
        if formal_punct_count <= 2 and formal_types.issubset({",", "。"}):
            raw_score += 0.3

        # Map onto a 0-100 scale and clamp.
        score_0_100 = (raw_score - 3.0) * 12 + 55
        score_0_100 = max(0, min(100, score_0_100))

        if score_0_100 >= 75:
            label = "human_like"
        elif score_0_100 >= 60:
            label = "maybe_human_like"
        else:
            label = "ai_slop_like"

        return {
            "score": round(score_0_100, 2),
            "raw": round(raw_score, 4),
            "mean": round(mean, 4),
            "var": round(var, 4),
            "peak": round(peak, 4),
            "len_var": round(len_var, 4),
            "continuity": round(continuity, 4),
            "punct_ratio": round(punct_ratio, 4),
            "formal_punct_ratio": round(formal_punct_ratio, 4),
            "label": label,
            "tokens": used,
        }
169
+
170
+
171
if __name__ == "__main__":
    # Interactive loop: read a line, print its score and the per-token
    # breakdown. Type "exit" / "quit" (or send EOF / Ctrl-C) to leave.
    scorer = PTTScorer()

    metric_keys = (
        "mean",
        "var",
        "peak",
        "len_var",
        "continuity",
        "punct_ratio",
        "formal_punct_ratio",
    )

    while True:
        try:
            text = input("\n請輸入文字:").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if text.lower() in {"exit", "quit"}:
            break

        out = scorer.score(text)

        print("\nhuman score:", out["score"])
        print("label:", out["label"])

        # The "unknown" result (no scorable tokens) may not carry every
        # metric, so print only what is present instead of raising KeyError.
        for key in metric_keys:
            if key in out:
                print(f"{key}:", out[key])

        print("\nTokens:")
        for item in out["tokens"]:
            if len(item) == 3:
                tok, val, flag = item
                print(f"{val:.3f} | {flag:7} | {tok}")
            else:
                tok, val = item
                print(f"{val:.3f} | {tok}")
config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "cleanowl",
3
+ "version": "0.1",
4
+ "base_engine": "pipeowl",
5
+ "task": "human_likeness_scoring",
6
+ "field_path": "ptt.npy",
7
+ "model_path": "pipeowl.safetensors",
8
+ "tokenizer_path": "tokenizer.json",
9
+ "architecture": "semantic-field-retrieval",
10
+ "embedding_dim": 256,
11
+ "vocab_size": 524190
12
+ }
engine.py ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PipeOwl engine: a transformer-free retrieval core based on
3
+ a static embedding field + delta field scoring.
4
+
5
+ This module is the retrieval backbone of the project.
6
+ Current implementation focuses on:
7
+ - vocabulary encoding
8
+ - field-based scoring
9
+ - top-k retrieval
10
+ - lightweight decode stub
11
+
12
+ NOTE:
13
+ Some comments below also describe design directions that are
14
+ not fully implemented yet.
15
+ """
16
+ ## -----------------------------------------------------------------------------
17
+ ## Design Notes / Future Work
18
+ ## -----------------------------------------------------------------------------
19
+ ##
20
+ ##這是使用笛卡兒座標做的embedding模型
21
+ ##在傳統QKV模型中
22
+ ##只保留了V
23
+ ##QK已簡化為delta field
24
+ ##
25
+ ##目前只保留最精簡的骨幹
26
+ ##來做為各個方向修復的彈性
27
+ ##
28
+ # TODO:
29
+ # - improve tokenizer behavior
30
+ # - explore gate-based score mode
31
+ # - evaluate trainable decode stage
32
+ #
33
+ ##如果自行訓練成LLM:
34
+ ##1.TOKEN NLL目前是13 離SOTA能力約500倍 但速度上壓到人類可接受的速度
35
+ ## 可以在CPU環境中可以把delta field訓練到7
36
+ ##2.TOKENIZER在邏輯上還有問題
37
+ ##3.SCORE MODE剛想到新的方式:GATE
38
+ ## 然後再用lose訓練"GATE" -> (1 - α*gate)*base + α*delta
39
+ ##
40
+ ##如果想使用在IME:
41
+ ##base在幾何上的意義是: 在多維空間中最靠近你INPUT的座標文字
42
+ ##delta field在幾何上的意義是: 每個詞的推論意義能力(有點類似ngram)
43
+ ##所以在應用場景內
44
+ ##要找意義相近的詞:base調大一點
45
+ ##要找下一個詞:delta field調大一點
46
+ ##所以在SCORE MODE可以選擇residual來達到平衡
47
+ #
48
+ # FIXME:
49
+ # - comments may describe future design, not only current implementation
50
+ ## -----------------------------------------------------------------------------
51
+
52
+ from __future__ import annotations
53
+
54
+ import json
55
+ import os
56
+ import re
57
+ import math
58
+ from dataclasses import dataclass
59
+ from safetensors.numpy import load_file # type: ignore
60
+ from typing import Dict, List, Tuple, Optional
61
+ import numpy as np # type: ignore
62
+ from pathlib import Path
63
+
64
+
65
# NOTE: despite its name, BASE_DIR is the resolved path of this file itself;
# the directory that holds the assets is BASE_DIR.parent (PipeOwlConfig
# derives its ROOT_DIR the same way).
BASE_DIR = Path(__file__).resolve()

# Load the tensors from the directory of this file rather than from the
# current working directory, so importing the module works from anywhere.
data = load_file(str(BASE_DIR.parent / "pipeowl.safetensors"))
67
+
68
@dataclass
class PipeOwlConfig:
    """Global settings for the PipeOwl engine.

    Fields:
        vocab_path: JSON vocabulary list. It must be index-aligned with the
            embedding rows and the delta field:
            index i <-> emb[i] <-> delta[i].
        normalize_rows: when True, enforce row-wise normalisation of the
            embeddings so that cosine similarity equals the dot product.
        ensure_contiguous: when True, make the embedding matrix C-contiguous
            for faster matrix-vector products.
        max_token_len_cap: cap on the tokenizer's maximum token length
            (meant to avoid the slow path / garbage vocabulary entries).
        alpha: weight of the base similarity.
        score_mode: "linear" or "residual" (formulas below).
        beta: weight of delta in "linear" mode (a static logit bias, not a
            dynamic loss).
        top_k: default number of retrieved tokens.
        temperature: sampling temperature for the decode stage.
        max_new_tokens: maximum generation length for the decode stage.

    Score modes:
        "linear"   -> score = alpha*base + beta*delta  (+ gamma*syntax, which
                      is not implemented; gamma is commented out)
        "residual" -> score = alpha*base + (1 - alpha*base)*delta

    Author's tuning notes:
        linear:   alpha=0.97, beta=0.03 performs well;
                  alpha=1, beta=0 is identical to the plain model.
        residual: alpha=1 is identical to the plain model;
                  alpha=0.9 performs well; alpha=0.5 surfaces more meanings.
        temperature=0.13 performs well.
    """

    # No annotation, so this is a plain class attribute rather than a
    # dataclass field (it is not an __init__ parameter).
    ROOT_DIR = BASE_DIR.parent

    # ---- assets -----------------------------------------------------------
    vocab_path: str = str(ROOT_DIR.joinpath("tokenizer.json"))

    # ---- embedding / tokenizer preparation --------------------------------
    normalize_rows: bool = False
    ensure_contiguous: bool = True
    max_token_len_cap: int = 32

    # ---- scoring ----------------------------------------------------------
    alpha: float = 1
    score_mode: str = "residual"  # "linear" | "residual"
    beta: float = 0.05
    # gamma: float = 1.5  (syntax term, not implemented)

    # ---- retrieval --------------------------------------------------------
    top_k: int = 16

    # ---- decode -----------------------------------------------------------
    temperature: float = 0.13
    max_new_tokens: int = 64
158
+
159
+ """
160
+ def softmax(scores, temperature=0.3):
161
+ scores = np.array(scores) / temperature
162
+ exp = np.exp(scores - np.max(scores))
163
+ return exp / exp.sum()
164
+ """
165
+
166
def eval_token_nll(engine, text):
    """Return the average negative log2-probability per predicted token.

    The text is tokenized with ``engine.tokenizer``. For every position after
    the first, the joined prefix is encoded, scored against the vocabulary
    and turned into probabilities; the probability assigned to the actual
    next token is then read off. Tokens missing from ``engine.token_to_id``
    and probabilities below 1e-9 are floored at 1e-9.

    Returns ``float("inf")`` when the text yields fewer than two tokens.
    """
    pieces = engine.tokenizer.tokenize(text)
    n_steps = len(pieces) - 1
    if n_steps < 1:
        return float("inf")

    floor = 1e-9
    total_bits = 0.0

    for step in range(1, n_steps + 1):
        prefix = "".join(pieces[:step])
        probs = engine.logits_to_probs(engine.score_vocab(engine.encode(prefix)))

        target_id = engine.token_to_id.get(pieces[step])
        p = floor if target_id is None else float(probs[target_id])

        total_bits += -math.log2(max(p, floor))

    return total_bits / n_steps
190
+
191
+ ## semanticizer
192
class VocabTokenizer:
    """Greedy longest-match tokenizer over a fixed vocabulary.

    At each position the longest substring (up to ``max_len`` characters)
    that is present in the vocabulary is emitted; when nothing matches, the
    single character at that position is emitted as a fallback token.

    Args:
        vocab_list: iterable of vocabulary strings.
        max_len_cap: optional upper bound on the match length.
    """

    def __init__(self, vocab_list, *, max_len_cap: Optional[int] = None):
        # Materialise once so a one-shot iterable is not consumed twice.
        self.vocab_set = set(vocab_list)

        # default=0 keeps an empty vocabulary usable: every character then
        # falls through to the single-character fallback.
        longest = max((len(t) for t in self.vocab_set), default=0)
        if max_len_cap is not None:
            longest = min(longest, int(max_len_cap))
        self.max_len = longest

    def tokenize(self, text):
        """Split *text* (lower-cased and stripped) into a list of tokens."""
        text = text.lower().strip()

        tokens = []
        vocab = self.vocab_set
        i = 0
        n = len(text)

        while i < n:
            # Never try a window longer than what is left of the text.
            for size in range(min(self.max_len, n - i), 0, -1):
                piece = text[i:i + size]
                if piece in vocab:
                    tokens.append(piece)
                    i += size
                    break
            else:
                # No vocabulary entry starts here: emit the raw character.
                tokens.append(text[i])
                i += 1

        return tokens
239
+
240
class PipeOwlEngine:
    """Core of the PipeOwl field-based retrieval engine.

    Design idea: an index is a coordinate in the semantic field.

        emb[i]   -> embedding vector of token i
        delta[i] -> scalar field offset of token i
        vocab[i] -> the token string itself

    Pipeline: text -> tokenize -> weighted mean embedding -> score against
    the vocabulary -> top-k -> decode (currently a stub).
    """

    def __init__(self, cfg: PipeOwlConfig):
        self.cfg = cfg

        # Tensors come from the module-level safetensors payload.
        self.emb = data["embeddings"].astype(np.float32)      # expected (V, D)
        self.delta = data["delta_field"].astype(np.float32)   # expected (V,)
        self.token_to_id: Dict[str, int] = {}
        self.id_to_token: List[str] = []

        # Inference-only stub; plug trained weights in later.
        self.decoder = MicroGPTDecoder()

        self._load_assets()

    # -------------------------
    # asset loading
    # -------------------------

    def _load_assets(self) -> None:
        """Validate the tensors and load the vocabulary.

        Embeddings (V, D), delta (V,) and the vocabulary list (V entries)
        must be index-aligned; the shape checks below enforce matching sizes.

        Raises:
            FileNotFoundError: the vocabulary file does not exist.
            ValueError: shapes are inconsistent or the vocabulary is not a
                list.
        """
        if not os.path.exists(self.cfg.vocab_path):
            raise FileNotFoundError(self.cfg.vocab_path)

        # embeddings: (V, D)
        emb = self.emb
        if emb.dtype != np.float32:
            emb = emb.astype(np.float32, copy=False)

        if emb.ndim != 2:
            raise ValueError(f"embeddings must be 2D (V, D), got shape={emb.shape}")

        # C-contiguous layout for faster matrix-vector products.
        if self.cfg.ensure_contiguous and not emb.flags["C_CONTIGUOUS"]:
            emb = np.ascontiguousarray(emb)

        if self.cfg.normalize_rows:
            norms = np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12
            emb = emb / norms

        # Store the prepared matrix; without this the two options above
        # would have no effect.
        self.emb = emb

        # delta: (V,)
        self.delta = data["delta_field"]
        if self.delta.dtype != np.float32:
            self.delta = self.delta.astype(np.float32, copy=False)

        V, _ = self.emb.shape

        if self.delta.ndim != 1 or self.delta.shape[0] != V:
            raise ValueError(f"delta must be shape (V,), got {self.delta.shape}, expected ({V},)")

        # vocab json: build token_to_id and id_to_token
        with open(self.cfg.vocab_path, "r", encoding="utf-8-sig") as f:
            vocab_list = json.load(f)

        if not isinstance(vocab_list, list):
            raise ValueError("vocab must be a list for geometric field mode")

        if len(vocab_list) != V:
            raise ValueError(f"vocab size {len(vocab_list)} != embeddings V {V}")

        self.vocab = vocab_list
        self.id_to_token = vocab_list
        self.token_to_id = {t: i for i, t in enumerate(vocab_list)}

        # Honour the configured cap on match length.
        self.tokenizer = VocabTokenizer(
            self.vocab, max_len_cap=self.cfg.max_token_len_cap
        )

    # -------------------------
    # encode (from vector library)
    # -------------------------

    def encode(self, text: str):
        """Project *text* into the embedding space.

        If the whole text is a vocabulary entry its (normalised) embedding is
        returned directly. Otherwise the text is tokenized and the embeddings
        of the known tokens are averaged, weighted by token length, then
        normalised:

            q = normalize(weighted_mean(emb[token_i]))

        Returns a zero vector when no token is in the vocabulary.

        Caveat: mean pooling weakens directionality.
        """
        # Exact-token fast path: avoids replacing a known word by the mean
        # of its parts.
        idx0 = self.token_to_id.get(text)
        if idx0 is not None:
            v = self.emb[idx0].astype(np.float32, copy=False)
            # Rows may or may not be normalised already; normalise anyway.
            v = v / (np.linalg.norm(v) + 1e-12)
            return v

        tokens = self.tokenizer.tokenize(text)
        if not tokens:
            return np.zeros(self.emb.shape[1], dtype=np.float32)

        vecs = []
        wts = []

        for t in tokens:
            idx = self.token_to_id.get(t)
            if idx is None:
                continue

            vecs.append(self.emb[idx])
            wts.append(max(1, len(t)))

        if not vecs:
            return np.zeros(self.emb.shape[1], dtype=np.float32)

        vecs = np.stack(vecs, axis=0).astype(np.float32, copy=False)
        wts = np.asarray(wts, dtype=np.float32)
        q = np.average(vecs, axis=0, weights=wts)
        q /= (np.linalg.norm(q) + 1e-12)
        return q

    # -------------------------
    # probs (decode)
    # -------------------------

    def logits_to_probs(self, logits: np.ndarray, temperature: Optional[float] = None) -> np.ndarray:
        """Temperature softmax over *logits*, computed in float64.

        Uses ``cfg.temperature`` when *temperature* is None; the temperature
        is floored at 1e-8. The result is returned as float32.
        """
        T = self.cfg.temperature if temperature is None else float(temperature)
        x = logits.astype(np.float64) / max(T, 1e-8)
        x = x - np.max(x)  # subtract the max for numerical stability
        exp_x = np.exp(x)
        return (exp_x / np.sum(exp_x)).astype(np.float32)

    # -------------------------
    # loss / scoring (delta)
    # -------------------------
    def score_vocab(self, q: np.ndarray, alpha: Optional[float] = None, beta: Optional[float] = None) -> np.ndarray:
        """Score every vocabulary token against the query vector *q*.

        base = emb @ q (cosine similarity when both sides are normalised).
        delta is the static per-token offset; it acts as a logit bias, not a
        loss or an energy gradient.

            "linear":   alpha*base + beta*delta
            "residual": alpha*base + (1 - alpha*base)*delta   (beta unused)

        Raises:
            ValueError: ``cfg.score_mode`` is not one of the modes above.
        """
        a = self.cfg.alpha if alpha is None else float(alpha)
        b = self.cfg.beta if beta is None else float(beta)

        base = self.emb @ q

        if self.cfg.score_mode == "linear":
            score = a * base + b * self.delta

        elif self.cfg.score_mode == "residual":
            score = a * base + (1 - a * base) * self.delta

        else:
            raise ValueError(f"Unknown score_mode: {self.cfg.score_mode}")

        return score.astype(np.float32, copy=False)

    def topk(self, score: np.ndarray, k: Optional[int] = None) -> List[Tuple[str, float]]:
        """Return the *k* highest-scoring tokens as ``[(token, score), ...]``.

        *k* defaults to ``cfg.top_k`` and is clamped to ``[1, len(score)]``.
        Scores may exceed 1 because delta is added to the similarity.
        """
        k = self.cfg.top_k if k is None else int(k)
        k = max(1, min(k, score.shape[0]))

        # argpartition selects the k best without a full sort; only those k
        # entries are then sorted.
        idx = np.argpartition(-score, k - 1)[:k]
        idx = idx[np.argsort(-score[idx])]

        out = []
        for i in idx:
            tok = self.id_to_token[i] if i < len(self.id_to_token) else str(i)
            out.append((tok, float(score[i])))
        return out

    # -------------------------
    # decode (microgpt inference-only)
    # -------------------------
    def decode(
        self,
        prompt_tokens: List[str],
        *,
        temperature: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
    ) -> str:
        """Join the non-empty prompt tokens with spaces and pass them to the decoder.

        Retrieval and generation are kept separate; the decoder is currently
        a placeholder without real weights.

        Args:
            prompt_tokens: tokens to build the prompt from.
            temperature: per-call override; ``cfg.temperature`` when None.
            max_new_tokens: per-call override; ``cfg.max_new_tokens`` when None.
        """
        prompt = " ".join([t for t in prompt_tokens if t])
        return self.decoder.generate(
            prompt=prompt,
            temperature=self.cfg.temperature if temperature is None else float(temperature),
            max_new_tokens=self.cfg.max_new_tokens if max_new_tokens is None else int(max_new_tokens),
        )

    # -------------------------
    # one-shot pipeline
    # -------------------------
    def pipeowl(
        self,
        text: str,
        *,
        top_k: Optional[int] = None,
        alpha: Optional[float] = None,
        beta: Optional[float] = None,
        temperature: Optional[float] = None,
        max_new_tokens: Optional[int] = None,
    ) -> Dict[str, object]:
        """Run the full pipeline once: encode -> score_vocab -> topk -> decode.

        Per-call overrides apply to this call only; the stored configuration
        is left untouched.

        Returns:
            {
                "query": the original text,
                "retrieved": top-k (token, score) pairs,
                "prompt": the token string passed to decode,
                "decoded": the decoder output,
            }
        """
        q = self.encode(text)
        s = self.score_vocab(q, alpha=alpha, beta=beta)
        retrieved = self.topk(s, k=top_k)

        # Build a prompt from the first (at most 8) retrieved tokens.
        prompt_tokens = [t for (t, _) in retrieved[: min(len(retrieved), 8)]]

        decoded = self.decode(
            prompt_tokens,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
        )
        return {
            "query": text,
            "retrieved": retrieved,
            "prompt": " ".join(prompt_tokens),
            "decoded": decoded,
        }
562
+
563
+
564
+ # ----------------------------------------------------------------------
565
+ # microgpt inference-only stub
566
+ # ----------------------------------------------------------------------
567
class MicroGPTDecoder:
    """Inference-time placeholder decoder.

    It exists so the pipeline is callable end to end. It is meant to be
    replaced later by, for example, a trained microGPT, an external LLM, or a
    field-driven sampling model.

    Why a placeholder: the microGPT reference trains its own weights
    in-process, whereas a real decode stage should either
      (A) load a trained state_dict from disk,
      (B) keep a tiny trained model in memory, or
      (C) use microGPT purely as a sampler over a learned char vocab.

    This class is the stable interface; plug the implementation in later.
    """

    def __init__(self):
        # Nothing to set up yet. Once trained weights exist, load them here,
        # e.g. self.state_dict = load(...)
        return None

    def generate(self, prompt: str, temperature: float = 0.8, max_new_tokens: int = 64) -> str:
        """Return the prompt tagged as stub output.

        ``temperature`` and ``max_new_tokens`` are accepted for interface
        compatibility and are not used by the stub.
        """
        # Minimal safe fallback that keeps the pipeline callable today;
        # replace with a forward pass + sampling once weights are available.
        return "[microgpt_stub] {}".format(prompt)
example.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AI組:
2
+
3
+ 請輸入文字:先思考:在 AI 時代,什麼樣的人才不會被取代?我的答案是:具備溝通能力的人、擁有韌性的人,以及始終願意站在第一線的人。
4
+
5
+ human score: 47.13
6
+ label: ai_slop_like
7
+ mean: 4.6324
8
+ var: 0.2266
9
+ peak: 0.3447
10
+ len_var: 0.2899
11
+ continuity: 0.1678
12
+ punct_ratio: 0.1207
13
+ formal_punct_ratio: 0.1034
14
+
15
+ 請輸入文字:這次轉職,我給自己的目標是「不重蹈覆轍」:拒絕因焦慮而盲目投遞,只做自己能力所及且擅長的事,採取精準打擊而非海量投遞。
16
+
17
+ human score: 41.89
18
+ label: ai_slop_like
19
+ mean: 3.7679
20
+ var: 2.5905
21
+ peak: 1.2078
22
+ len_var: 0.2398
23
+ continuity: 0.1291
24
+ punct_ratio: 0.1186
25
+ formal_punct_ratio: 0.1186
26
+
27
+ 請輸入文字:現在這台「電腦」已經可以跑 shell 指令,也透過 WebAssembly 放到瀏覽器上,任何人都能直接打開體驗。
28
+
29
+ human score: 45.65
30
+ label: ai_slop_like
31
+ mean: 4.4023
32
+ var: 0.6829
33
+ peak: 0.5784
34
+ len_var: 2.4598
35
+ continuity: 0.1695
36
+ punct_ratio: 0.0926
37
+ formal_punct_ratio: 0.0926
38
+
39
+ 人類留言組:
40
+
41
+ 請輸入文字:"打一槍冷靜一下比較好,真吃了以後問題會很多..."
42
+
43
+ human score: 68.71
44
+ label: maybe_human_like
45
+ mean: 4.8901
46
+ var: 0.0112
47
+ peak: 0.0889
48
+ len_var: 0.2314
49
+ continuity: 0.2634
50
+ punct_ratio: 0.1538
51
+ formal_punct_ratio: 0.0385
52
+
53
+ 請輸入文字:身為專業的肥宅 都會把脂肪放在身上
54
+
55
+ human score: 76.88
56
+ label: human_like
57
+ mean: 4.8223
58
+ var: 0.0111
59
+ peak: 0.1155
60
+ len_var: 0.3951
61
+ continuity: 0.0912
62
+ punct_ratio: 0.0
63
+ formal_punct_ratio: 0.0
pipeowl.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc4b7463c8348458ecbb8ba5d9ba9a8805a51c4dc858756735f7e8eeb6d0a146
3
+ size 269433956
ptt.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a6bf5b91e0d0a74b91e7e665f93561649dec60dc5219aced2ff88d0ae3096c
3
+ size 4193648
quickstart.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Interactive retrieval demo: type a sentence, see the top-5 tokens."""
from engine import PipeOwlEngine, PipeOwlConfig
import time  # used only by the optional timing snippet below

# Built once at import time, as in the original script.
engine = PipeOwlEngine(PipeOwlConfig())

# Optional timing check (disabled). To measure, take
# `t0 = time.perf_counter()` before the engine is constructed, then:
#
#     t1 = time.perf_counter()
#     print(f"\n🚀 Cold start time: {(t1 - t0)*1000:.2f} ms\n")
#     for _ in range(20):
#         t0 = time.perf_counter()
#         engine.pipeowl("雪鴞")
#         print((time.perf_counter() - t0) * 1000, "ms")


def main() -> None:
    """Prompt in a loop until "exit"/"quit", EOF or Ctrl-C."""
    while True:
        print()
        try:
            query = input("請輸入句子: ")
        except (EOFError, KeyboardInterrupt):
            # Leave quietly instead of dumping a traceback.
            print()
            break
        if query.strip().lower() in {"exit", "quit"}:
            break

        out = engine.pipeowl(query, top_k=5)

        print("\nTop-K Tokens:")
        for text, score in out["retrieved"]:
            print(f"{score:.3f} | {text}")

        # The decode stage is still a stub; uncomment to show its output.
        # print("\nDecoded:")
        # print(out["decoded"])

        print()


if __name__ == "__main__":
    main()
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff