# WangKaiLin's picture
# Upload 10 files (#1)
# 1591c0c
# ptt_score.py
from pathlib import Path
import numpy as np
from engine import PipeOwlEngine, PipeOwlConfig
# Directory containing this file; data files are resolved relative to it.
BASE_DIR = Path(__file__).resolve().parent
# NumPy array loaded by PTTScorer.__init__ and indexed by token id in score().
FIELD_PATH = BASE_DIR / "ptt.npy"
# NOTE(review): several characters occur more than once inside the literals
# below. set() deduplicates them so this is harmless at runtime, but the
# repeats may originally have been full-width/half-width pairs -- confirm
# against the upstream file.
# Characters treated as punctuation/whitespace: a token made only of these is
# not scored (see is_valid_style_token).
PUNCT = set(",。!?、;:,.!?;:()()[]【】「」『』《》〈〉\"'`~…—-_ ")
# Single-character tokens found in this set are not scored
# (see is_valid_style_token).
STOP = set("的一是在有和人不我他你它這那就都也很到說要會可以的了嗎啊吧啦喔")
# Characters counted for the reported "punct_ratio" metric.
PUNCT_STRONG = set(",。;:「」『』()()、,.!?!?:;")
# Same characters as PUNCT_STRONG minus "?"; it does not influence any value
# reported by PTTScorer.score.
PUNCT_FORMAL = set(",。;:「」『』()()、,.!:;")
# Characters counted as casual markers; besides punctuation this includes
# letters/characters such as w, W, X, D, x, d, 哈 and ㄏ.
CASUAL_PUNCT = set("!?!?~~=wW哈ㄏXDxd.")
# Characters counted as formal punctuation for the score penalty and for the
# reported "formal_punct_ratio" metric.
FORMAL_PUNCT = set(",。;:「」『』()()、,:;")
def is_valid_style_token(tok: str) -> bool:
    """Return True when *tok* should contribute to the style score.

    A token is rejected when, after stripping surrounding whitespace, it is
    empty, consists solely of characters in ``PUNCT``, or is a single
    character listed in ``STOP``.
    """
    core = tok.strip()
    if not core:
        return False
    # Punctuation-only tokens carry no score.
    only_punct = not any(ch not in PUNCT for ch in core)
    # Neither do single-character common function words.
    lone_stop_word = len(core) == 1 and core in STOP
    return not (only_punct or lone_stop_word)
class PTTScorer:
    """Heuristic scorer rating how casual / PTT-like a piece of text reads.

    Each known token is looked up in a per-token value array (``self.field``).
    Statistics of those values are combined with an embedding-continuity term
    and punctuation heuristics into a 0-100 score and a label.
    """

    def __init__(self):
        """Create the engine and load the per-token value array from FIELD_PATH."""
        self.engine = PipeOwlEngine(PipeOwlConfig())
        # Indexed by the ids returned from engine.token_to_id in score().
        self.field = np.load(FIELD_PATH).astype(np.float32)

    def score(self, text: str):
        """Score *text* and return the metrics as a dict.

        Returns:
            A dict with the keys ``score`` (0-100), ``raw``, ``mean``, ``var``,
            ``peak``, ``len_var``, ``continuity``, ``punct_ratio``,
            ``formal_punct_ratio``, ``label`` and ``tokens``.  ``label`` is one
            of ``"human_like"``, ``"maybe_human_like"``, ``"ai_slop_like"`` or,
            when no token could be scored, ``"unknown"``.  ``tokens`` is a list
            of ``(token, value, "used" | "ignored")`` tuples; it is empty for
            the ``"unknown"`` result.
        """
        tokens = self.engine.tokenizer.tokenize(text)

        # ---- character-level punctuation statistics -------------------
        chars = [ch for ch in text if not ch.isspace()]
        n_chars = max(1, len(chars))  # avoids division by zero on empty text
        punct_ratio = sum(ch in PUNCT_STRONG for ch in chars) / n_chars
        casual_punct_count = sum(ch in CASUAL_PUNCT for ch in chars)
        formal_punct_count = sum(ch in FORMAL_PUNCT for ch in chars)
        casual_punct_ratio = casual_punct_count / n_chars
        formal_punct_ratio = formal_punct_count / n_chars
        formal_types = {ch for ch in chars if ch in FORMAL_PUNCT}
        casual_types = {ch for ch in chars if ch in CASUAL_PUNCT}
        # NOTE(review): "(" and ")" are each counted twice here, so every
        # parenthesis weighs double in the penalty below. The repeated terms
        # may originally have been full-width parentheses -- confirm.
        paren_count = text.count("(") + text.count(")") + text.count("(") + text.count(")")
        quote_count = text.count("「") + text.count("」") + text.count('"') + text.count("'")

        # ---- token-level lookups --------------------------------------
        vals = []  # field values of tokens that count towards the score
        used = []  # (token, value, flag) for every in-vocabulary token
        vecs = []  # embeddings of every in-vocabulary token, in order
        for tok in tokens:
            idx = self.engine.token_to_id.get(tok)
            if idx is None:
                continue  # token unknown to the engine: skip entirely
            vecs.append(self.engine.emb[idx])
            val = float(self.field[idx])
            if is_valid_style_token(tok):
                vals.append(val)
                used.append((tok, val, "used"))
            else:
                used.append((tok, val, "ignored"))

        # Mean dot product between embeddings of consecutive known tokens.
        adjacent_sims = [float(np.dot(a, b)) for a, b in zip(vecs, vecs[1:])]
        continuity = float(np.mean(adjacent_sims)) if adjacent_sims else 0.0

        if not vals:
            # Nothing to score. Return the same keys as the normal result so
            # callers can read every metric without a KeyError.
            return {
                "score": 0.0,
                "raw": 0.0,
                "mean": 0.0,
                "var": 0.0,
                "peak": 0.0,
                "len_var": 0.0,
                "continuity": round(continuity, 4),
                "punct_ratio": round(punct_ratio, 4),
                "formal_punct_ratio": round(formal_punct_ratio, 4),
                "label": "unknown",
                "tokens": [],
            }

        # Mean value: how PTT-like the text is as a whole.
        val_arr = np.array(vals, dtype=np.float32)
        mean = float(np.mean(val_arr))
        var = float(np.var(val_arr))
        peak = float(np.max(val_arr) - mean)

        # Variance of the character lengths of the scored tokens.
        lengths = np.array(
            [len(tok) for tok, _val, flag in used if flag != "ignored"],
            dtype=np.float32,
        )
        len_var = float(np.var(lengths)) if len(lengths) > 0 else 0.0

        raw_score = (
            mean
            + 0.30 * var
            + 0.20 * peak
            + 0.10 * len_var
            - 4.0 * continuity
        )

        clean_structure = 1.0 if (len_var > 3.0 and var > 2.0 and continuity > 0.12) else 0.0
        raw_score -= 1.2 * clean_structure

        # Punctuation / formatting penalties.
        raw_score -= 10.0 * formal_punct_ratio
        raw_score -= 0.25 * paren_count
        raw_score -= 0.20 * quote_count

        # Casual punctuation is not penalised; it earns a small capped bonus.
        if casual_punct_ratio > 0:
            raw_score += min(0.8, casual_punct_ratio * 3.0)

        # A single kind of casual mark that is repeated reads as human chatter.
        if len(casual_types) == 1 and casual_punct_count >= 2:
            raw_score += 0.7

        # Many different formal marks look like an article / AI output.
        if len(formal_types) >= 3:
            raw_score -= 0.8

        # At most one "," / "。" pair: soften the formal penalty.
        if formal_punct_count <= 2 and formal_types.issubset({",", "。"}):
            raw_score += 0.3

        # Map to a 0-100 scale and clamp.
        score_0_100 = (raw_score - 3.0) * 12 + 55
        score_0_100 = max(0, min(100, score_0_100))

        if score_0_100 >= 75:
            label = "human_like"
        elif score_0_100 >= 60:
            label = "maybe_human_like"
        else:
            label = "ai_slop_like"

        return {
            "score": round(score_0_100, 2),
            "raw": round(raw_score, 4),
            "mean": round(mean, 4),
            "var": round(var, 4),
            "peak": round(peak, 4),
            "len_var": round(len_var, 4),
            "continuity": round(continuity, 4),
            "punct_ratio": round(punct_ratio, 4),
            "formal_punct_ratio": round(formal_punct_ratio, 4),
            "label": label,
            "tokens": used,
        }
if __name__ == "__main__":
    # Interactive loop: read a line, score it, print the metrics and tokens.
    scorer = PTTScorer()
    metric_keys = (
        "mean",
        "var",
        "peak",
        "len_var",
        "continuity",
        "punct_ratio",
        "formal_punct_ratio",
    )

    while True:
        try:
            text = input("\n請輸入文字:").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed or Ctrl-C: leave quietly instead of a traceback.
            print()
            break
        if text.lower() in {"exit", "quit"}:
            break

        out = scorer.score(text)

        print("\nhuman score:", out["score"])
        print("label:", out["label"])
        # Print only the metrics present in the result, so a minimal result
        # dict cannot raise KeyError.
        for key in metric_keys:
            if key in out:
                print(f"{key}:", out[key])

        print("\nTokens:")
        for item in out["tokens"]:
            if len(item) == 3:
                tok, val, flag = item
                print(f"{val:.3f} | {flag:7} | {tok}")
            else:
                tok, val = item
                print(f"{val:.3f} | {tok}")