Upload 10 files (#1)

1591c0c about 1 month ago

6.45 kB

	# ptt_score.py

	from pathlib import Path
	import numpy as np

	from engine import PipeOwlEngine, PipeOwlConfig

	# Directory that contains this file; used to locate the data file below.
	BASE_DIR = Path(__file__).resolve().parent
	# Array file loaded by PTTScorer.__init__ and indexed by token id in score().
	FIELD_PATH = BASE_DIR / "ptt.npy"

	# A token made up only of these characters is rejected by is_valid_style_token.
	PUNCT = set("，。！？、；：,.!?;:()（）[]【】「」『』《》〈〉\"'`~…—-_ ")
	# A single-character token found in this set is rejected by is_valid_style_token.
	STOP = set("的一是在有和人不我他你它這那就都也很到說要會可以的了嗎啊吧啦喔")
	# Characters counted for the "punct_ratio" value reported by PTTScorer.score.
	PUNCT_STRONG = set("，。；：「」『』（）()、,.!?！？:;")
	# NOTE(review): overlaps heavily with FORMAL_PUNCT, and only FORMAL_PUNCT
	# feeds the final score in PTTScorer.score - confirm this set is still needed.
	PUNCT_FORMAL = set("，。；：「」『』（）()、,.!:;")
	# Characters counted as casual markers; note it contains letters and
	# 哈 / ㄏ as well as punctuation, so those characters are counted too.
	CASUAL_PUNCT = set("!?！？~～=wW哈ㄏXDxd.")
	# Characters counted as formal punctuation (penalized in PTTScorer.score).
	FORMAL_PUNCT = set("，。；：「」『』（）()、,：;")

	def is_valid_style_token(tok: str) -> bool:
	    """Return True when *tok* should contribute to the style score.

	    A token is rejected when, after stripping surrounding whitespace, it is
	    empty, consists solely of characters in ``PUNCT``, or is a single
	    character listed in ``STOP``.
	    """
	    stripped = tok.strip()
	    if stripped == "":
	        return False

	    # Punctuation-only tokens carry no style signal.
	    only_punct = not any(ch not in PUNCT for ch in stripped)
	    # Single-character function words are too common to be informative.
	    lone_stop_char = len(stripped) == 1 and stripped in STOP

	    return not (only_punct or lone_stop_char)

	class PTTScorer:
	    """Score a text from per-token field values plus punctuation heuristics.

	    ``engine`` supplies the tokenizer, the token-to-id mapping and the token
	    embeddings; ``field`` is the array loaded from ``FIELD_PATH`` and is
	    indexed by token id.
	    """

	    def __init__(self):
	        """Build the engine with its default config and load the field array."""
	        self.engine = PipeOwlEngine(PipeOwlConfig())
	        self.field = np.load(FIELD_PATH).astype(np.float32)

	    def score(self, text: str):
	        """Score *text* and return a dict describing the result.

	        The returned dict always contains the same keys, so callers can read
	        every statistic without checking the label first:

	        * ``score``  - final value clamped to the range 0..100
	        * ``raw``    - unscaled score before the 0..100 mapping
	        * ``mean`` / ``var`` / ``peak`` - statistics of the field values of
	          the tokens that were actually used
	        * ``len_var`` - variance of the lengths of the used tokens
	        * ``continuity`` - mean dot product of consecutive token embeddings
	        * ``punct_ratio`` / ``formal_punct_ratio`` - share of non-space
	          characters found in ``PUNCT_STRONG`` / ``FORMAL_PUNCT``
	        * ``label``  - ``"human_like"``, ``"maybe_human_like"``,
	          ``"ai_slop_like"`` or ``"unknown"``
	        * ``tokens`` - list of ``(token, field_value, "used" | "ignored")``

	        When no token qualifies for scoring, the label is ``"unknown"``, the
	        score and the value statistics are ``0.0`` and ``tokens`` is empty.
	        """
	        tokens = self.engine.tokenizer.tokenize(text)

	        # ---- character-level punctuation statistics ----
	        chars = [ch for ch in text if not ch.isspace()]
	        n_chars = max(1, len(chars))  # avoid dividing by zero on empty text

	        punct_ratio = sum(ch in PUNCT_STRONG for ch in chars) / n_chars

	        casual_punct_count = sum(ch in CASUAL_PUNCT for ch in chars)
	        formal_punct_count = sum(ch in FORMAL_PUNCT for ch in chars)
	        casual_punct_ratio = casual_punct_count / n_chars
	        formal_punct_ratio = formal_punct_count / n_chars

	        formal_types = {ch for ch in chars if ch in FORMAL_PUNCT}
	        casual_types = {ch for ch in chars if ch in CASUAL_PUNCT}

	        paren_count = sum(text.count(ch) for ch in "()（）")
	        quote_count = sum(text.count(ch) for ch in "「」\"'")

	        # ---- token-level field values and embeddings ----
	        vals = []
	        used = []
	        vecs = []
	        for tok in tokens:
	            idx = self.engine.token_to_id.get(tok)
	            if idx is None:
	                # Tokens unknown to the engine contribute nothing.
	                continue

	            # Every known token counts toward continuity, even if ignored below.
	            vecs.append(self.engine.emb[idx])
	            val = float(self.field[idx])

	            if is_valid_style_token(tok):
	                vals.append(val)
	                used.append((tok, val, "used"))
	            else:
	                used.append((tok, val, "ignored"))

	        sims = [float(np.dot(a, b)) for a, b in zip(vecs, vecs[1:])]
	        continuity = float(np.mean(sims)) if sims else 0.0

	        if not vals:
	            # Nothing to score: still return the full key set so that callers
	            # reading the statistics do not hit a KeyError.
	            return {
	                "score": 0.0,
	                "raw": 0.0,
	                "mean": 0.0,
	                "var": 0.0,
	                "peak": 0.0,
	                "len_var": 0.0,
	                "continuity": round(continuity, 4),
	                "punct_ratio": round(punct_ratio, 4),
	                "formal_punct_ratio": round(formal_punct_ratio, 4),
	                "label": "unknown",
	                "tokens": [],
	            }

	        # Mean: how PTT-like the text is overall.
	        vals = np.array(vals, dtype=np.float32)
	        mean = float(np.mean(vals))
	        var = float(np.var(vals))
	        peak = float(np.max(vals) - mean)

	        lengths = np.array(
	            [len(tok) for tok, _, flag in used if flag == "used"],
	            dtype=np.float32,
	        )
	        len_var = float(np.var(lengths)) if len(lengths) > 0 else 0.0

	        raw_score = (
	            mean
	            + 0.30 * var
	            + 0.20 * peak
	            + 0.10 * len_var
	            - 4.0 * continuity
	        )

	        clean_structure = (
	            1.0 if (len_var > 3.0 and var > 2.0 and continuity > 0.12) else 0.0
	        )
	        raw_score -= 1.2 * clean_structure

	        # Punctuation / formatting penalties.
	        raw_score -= 10.0 * formal_punct_ratio
	        raw_score -= 0.25 * paren_count
	        raw_score -= 0.20 * quote_count

	        # Casual punctuation is not penalized; it earns a small bonus.
	        if casual_punct_ratio > 0:
	            raw_score += min(0.8, casual_punct_ratio * 3.0)

	        # Exactly one kind of casual punctuation, repeated: treated as
	        # human colloquial writing.
	        if len(casual_types) == 1 and casual_punct_count >= 2:
	            raw_score += 0.7

	        # Many kinds of formal punctuation: reads like an article / AI text.
	        if len(formal_types) >= 3:
	            raw_score -= 0.8

	        # Only a single pair of full-width comma / period: soften the penalty.
	        if formal_punct_count <= 2 and formal_types.issubset({"，", "。"}):
	            raw_score += 0.3

	        # Map onto a 0-100 scale.
	        score_0_100 = (raw_score - 3.0) * 12 + 55
	        score_0_100 = max(0, min(100, score_0_100))

	        if score_0_100 >= 75:
	            label = "human_like"
	        elif score_0_100 >= 60:
	            label = "maybe_human_like"
	        else:
	            label = "ai_slop_like"

	        return {
	            "score": round(score_0_100, 2),
	            "raw": round(raw_score, 4),
	            "mean": round(mean, 4),
	            "var": round(var, 4),
	            "peak": round(peak, 4),
	            "len_var": round(len_var, 4),
	            "continuity": round(continuity, 4),
	            "punct_ratio": round(punct_ratio, 4),
	            "formal_punct_ratio": round(formal_punct_ratio, 4),
	            "label": label,
	            "tokens": used,
	        }


	def main() -> None:
	    """Interactive loop: read a line of text, score it and print the report.

	    Typing ``exit`` or ``quit`` (any case), or sending EOF / Ctrl-C, ends
	    the loop.
	    """
	    scorer = PTTScorer()
	    stat_keys = (
	        "mean",
	        "var",
	        "peak",
	        "len_var",
	        "continuity",
	        "punct_ratio",
	        "formal_punct_ratio",
	    )

	    while True:
	        try:
	            text = input("\n請輸入文字：").strip()
	        except (EOFError, KeyboardInterrupt):
	            # Closed stdin or Ctrl-C: leave quietly instead of a traceback.
	            print()
	            break
	        if text.lower() in {"exit", "quit"}:
	            break

	        out = scorer.score(text)

	        print("\nhuman score:", out["score"])
	        print("label:", out["label"])

	        # A result may lack some statistics (for example when no token could
	        # be scored), so print only the ones that are present.
	        for key in stat_keys:
	            if key in out:
	                print(f"{key}:", out[key])

	        print("\nTokens:")
	        for item in out["tokens"]:
	            if len(item) == 3:
	                tok, val, flag = item
	                print(f"{val:.3f} | {flag:7} | {tok}")
	            else:
	                tok, val = item
	                print(f"{val:.3f} | {tok}")


	if __name__ == "__main__":
	    main()