# ptt_score.py
"""Heuristic scorer for how PTT-like (casual, human-written) a text reads.

The score combines per-token values loaded from ``ptt.npy`` with
punctuation/format statistics of the raw text and maps the result onto a
0-100 scale with a coarse label.
"""
from pathlib import Path

import numpy as np

from engine import PipeOwlEngine, PipeOwlConfig

BASE_DIR = Path(__file__).resolve().parent
FIELD_PATH = BASE_DIR / "ptt.npy"

# Characters treated as punctuation when deciding whether a token is scoreable.
PUNCT = set(",。!?、;:,.!?;:()()[]【】「」『』《》〈〉\"'`~…—-_ ")
# Common single-character function words that carry no style signal.
STOP = set("的一是在有和人不我他你它這那就都也很到說要會可以的了嗎啊吧啦喔")

# Used only for the informational ``punct_ratio`` output.
PUNCT_STRONG = set(",。;:「」『』()()、,.!?!?:;")
# Not referenced by the scorer below; kept because other modules may import it.
PUNCT_FORMAL = set(",。;:「」『』()()、,.!:;")

# Punctuation sets that actually drive the bonus/penalty terms in ``score``.
CASUAL_PUNCT = set("!?!?~~=wW哈ㄏXDxd.")
FORMAL_PUNCT = set(",。;:「」『』()()、,:;")


def is_valid_style_token(tok: str) -> bool:
    """Return True if *tok* should contribute to the style score.

    Empty/whitespace tokens, tokens made only of ``PUNCT`` characters, and
    single-character tokens found in ``STOP`` are rejected.
    """
    tok = tok.strip()
    if not tok:
        return False

    # Punctuation-only tokens do not count.
    if all(ch in PUNCT for ch in tok):
        return False

    # Single-character common function words do not count.
    if len(tok) == 1 and tok in STOP:
        return False

    return True


class PTTScorer:
    """Scores text using a PipeOwl engine and a per-token value table."""

    def __init__(self):
        """Create the engine and load the value table from ``FIELD_PATH``."""
        self.engine = PipeOwlEngine(PipeOwlConfig())
        # Indexed by token id in ``score``; stored as float32.
        self.field = np.load(FIELD_PATH).astype(np.float32)

    def score(self, text: str) -> dict:
        """Score *text* and return the score together with its diagnostics.

        Args:
            text: The text to evaluate.

        Returns:
            A dict with the keys ``score`` (0-100), ``raw``, ``mean``, ``var``,
            ``peak``, ``len_var``, ``continuity``, ``punct_ratio``,
            ``formal_punct_ratio``, ``label`` and ``tokens``. ``tokens`` is a
            list of ``(token, value, flag)`` tuples where flag is ``"used"``
            or ``"ignored"``. When no token is scoreable the label is
            ``"unknown"``, ``score`` and the token statistics are 0.0 and
            ``tokens`` is empty; every key is still present so callers can
            read them unconditionally.
        """
        tokens = self.engine.tokenizer.tokenize(text)

        # --- Character-level punctuation statistics -------------------------
        chars = [ch for ch in text if not ch.isspace()]
        n_chars = max(1, len(chars))  # avoid division by zero on empty text

        punct_ratio = sum(ch in PUNCT_STRONG for ch in chars) / n_chars

        # NOTE(review): the two pairs of counts below are identical in this
        # file, so every parenthesis is counted twice. The second pair was
        # presumably meant to be full-width parentheses -- confirm before
        # changing, since it affects the penalty weight.
        paren_count = (
            text.count("(") + text.count(")")
            + text.count("(") + text.count(")")
        )
        quote_count = (
            text.count("「") + text.count("」")
            + text.count('"') + text.count("'")
        )

        casual_punct_count = sum(ch in CASUAL_PUNCT for ch in chars)
        formal_punct_count = sum(ch in FORMAL_PUNCT for ch in chars)
        casual_punct_ratio = casual_punct_count / n_chars
        formal_punct_ratio = formal_punct_count / n_chars
        formal_types = {ch for ch in chars if ch in FORMAL_PUNCT}
        casual_types = {ch for ch in chars if ch in CASUAL_PUNCT}

        # --- Token-level values ---------------------------------------------
        vals = []   # values of tokens that count toward the score
        used = []   # (token, value, flag) for every in-vocabulary token
        vecs = []   # engine vectors of every in-vocabulary token, in order

        for tok in tokens:
            idx = self.engine.token_to_id.get(tok)
            if idx is None:
                # Out-of-vocabulary tokens are skipped entirely.
                continue

            vecs.append(self.engine.emb[idx])
            val = float(self.field[idx])

            if not is_valid_style_token(tok):
                used.append((tok, val, "ignored"))
                continue

            vals.append(val)
            used.append((tok, val, "used"))

        # Mean dot product of adjacent token vectors (ignored tokens included).
        # Presumably a cosine similarity if ``emb`` rows are unit-normalised --
        # TODO confirm against the engine.
        sims = [float(np.dot(v1, v2)) for v1, v2 in zip(vecs, vecs[1:])]
        continuity = float(np.mean(sims)) if sims else 0.0

        if not vals:
            # Nothing scoreable. Return the full key set (with neutral values)
            # so consumers such as the CLI below never hit a KeyError.
            return {
                "score": 0.0,
                "raw": 0.0,
                "mean": 0.0,
                "var": 0.0,
                "peak": 0.0,
                "len_var": 0.0,
                "continuity": round(continuity, 4),
                "punct_ratio": round(punct_ratio, 4),
                "formal_punct_ratio": round(formal_punct_ratio, 4),
                "label": "unknown",
                "tokens": [],
            }

        # Mean: how PTT-like the text is overall.
        vals = np.array(vals, dtype=np.float32)
        mean = float(np.mean(vals))
        var = float(np.var(vals))
        peak = float(np.max(vals) - mean)

        # Variance of the lengths of the tokens that were actually scored.
        lengths = np.array(
            [len(tok) for tok, _val, flag in used if flag != "ignored"],
            dtype=np.float32,
        )
        len_var = float(np.var(lengths)) if len(lengths) > 0 else 0.0

        raw_score = (
            mean
            + 0.30 * var
            + 0.20 * peak
            + 0.10 * len_var
            - 4.0 * continuity
        )

        # Extra penalty when all three structure indicators are high at once.
        if len_var > 3.0 and var > 2.0 and continuity > 0.12:
            raw_score -= 1.2

        # Punctuation / formatting penalties.
        raw_score -= 10.0 * formal_punct_ratio
        raw_score -= 0.25 * paren_count
        raw_score -= 0.20 * quote_count

        # Casual punctuation is not penalised; it earns a small capped bonus.
        if casual_punct_ratio > 0:
            raw_score += min(0.8, casual_punct_ratio * 3.0)

        # A single kind of casual punctuation, repeated, reads as human
        # colloquial writing.
        if len(casual_types) == 1 and casual_punct_count >= 2:
            raw_score += 0.7

        # Many kinds of formal punctuation reads like an article / AI text.
        if len(formal_types) >= 3:
            raw_score -= 0.8

        # At most a couple of plain commas/full stops: soften the penalty.
        if formal_punct_count <= 2 and formal_types.issubset({",", "。"}):
            raw_score += 0.3

        # Map onto a 0-100 scale and clamp.
        score_0_100 = (raw_score - 3.0) * 12 + 55
        score_0_100 = max(0, min(100, score_0_100))

        if score_0_100 >= 75:
            label = "human_like"
        elif score_0_100 >= 60:
            label = "maybe_human_like"
        else:
            label = "ai_slop_like"

        return {
            "score": round(score_0_100, 2),
            "raw": round(raw_score, 4),
            "mean": round(mean, 4),
            "var": round(var, 4),
            "peak": round(peak, 4),
            "len_var": round(len_var, 4),
            "continuity": round(continuity, 4),
            "punct_ratio": round(punct_ratio, 4),
            "formal_punct_ratio": round(formal_punct_ratio, 4),
            "label": label,
            "tokens": used,
        }


if __name__ == "__main__":
    scorer = PTTScorer()

    while True:
        try:
            text = input("\n請輸入文字:").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input / Ctrl-C: leave the loop instead of a traceback.
            break

        if text.lower() in {"exit", "quit"}:
            break

        out = scorer.score(text)

        print("\nhuman score:", out["score"])
        print("label:", out["label"])
        print("mean:", out["mean"])
        print("var:", out["var"])
        print("peak:", out["peak"])
        print("len_var:", out["len_var"])
        print("continuity:", out["continuity"])
        print("punct_ratio:", out["punct_ratio"])
        print("formal_punct_ratio:", out["formal_punct_ratio"])

        print("\nTokens:")
        for item in out["tokens"]:
            if len(item) == 3:
                tok, val, flag = item
                print(f"{val:.3f} | {flag:7} | {tok}")
            else:
                tok, val = item
                print(f"{val:.3f} | {tok}")