|
|
|
|
| from pathlib import Path
|
| import numpy as np
|
|
|
| from engine import PipeOwlEngine, PipeOwlConfig
|
|
|
# Directory containing this module; data files are resolved relative to it.
BASE_DIR = Path(__file__).resolve().parents[0]

# NumPy array file that PTTScorer loads as its per-token field.
FIELD_PATH = BASE_DIR.joinpath("ptt.npy")

# Characters regarded as punctuation/whitespace by is_valid_style_token:
# a token made up only of these is not scored.
PUNCT = set(",。!?、;:,.!?;:()()[]【】「」『』《》〈〉\"'`~…—-_ ")

# Single characters that is_valid_style_token rejects when they form a
# whole token on their own.
STOP = set("的一是在有和人不我他你它這那就都也很到說要會可以的了嗎啊吧啦喔")

# Characters counted for the reported "punct_ratio" metric.
PUNCT_STRONG = set(",。;:「」『』()()、,.!?!?:;")

# Variant of the formal punctuation set that additionally contains "." and "!".
PUNCT_FORMAL = set(",。;:「」『』()()、,.!:;")

# Characters whose presence raises the score in PTTScorer.score.
CASUAL_PUNCT = set("!?!?~~=wW哈ㄏXDxd.")

# Characters whose presence lowers the score in PTTScorer.score.
FORMAL_PUNCT = set(",。;:「」『』()()、,:;")
|
|
|
def is_valid_style_token(tok: str) -> bool:
    """Return True when *tok* should contribute to the style score.

    The token is first stripped of surrounding whitespace. It is rejected
    when the result is empty, when every character of it is in ``PUNCT``,
    or when it is a single character that appears in ``STOP``.
    """
    stripped = tok.strip()
    if stripped == "":
        return False

    # Non-empty here, so the subset test matches "all characters in PUNCT".
    only_punctuation = set(stripped) <= PUNCT
    lone_stop_char = len(stripped) == 1 and stripped in STOP
    return not (only_punctuation or lone_stop_char)
|
|
|
class PTTScorer:
    """Score how "human-like" a text reads from token and punctuation stats.

    The scorer combines three sources of evidence:

    * per-token values read from ``self.field`` (indexed by the engine's
      token id) -- presumably a PTT-derived style weight; confirm against
      whatever produces ``ptt.npy``;
    * dot products between the engine embeddings of consecutive tokens;
    * counts of casual vs. formal punctuation characters in the raw text.
    """

    def __init__(self):
        """Create the engine and load the field array from ``FIELD_PATH``."""
        self.engine = PipeOwlEngine(PipeOwlConfig())
        self.field = np.load(FIELD_PATH).astype(np.float32)

    def score(self, text: str):
        """Compute a 0-100 style score and a label for *text*.

        Args:
            text: The raw text to score.

        Returns:
            A dict with the keys ``score`` (clamped to 0-100), ``raw``,
            ``mean``, ``var``, ``peak``, ``len_var``, ``continuity``,
            ``punct_ratio``, ``formal_punct_ratio``, ``label`` and
            ``tokens``. ``tokens`` is a list of ``(token, value, flag)``
            tuples where ``flag`` is ``"used"`` or ``"ignored"``.

            When no token of *text* is both known to the engine and accepted
            by ``is_valid_style_token``, the label is ``"unknown"``, the
            score and all token-derived metrics are ``0.0`` and ``tokens``
            is empty. The same keys are present in both cases so callers
            can index the result unconditionally.
        """
        tokens = self.engine.tokenizer.tokenize(text)

        # --- punctuation statistics over the non-whitespace characters ---
        chars = [ch for ch in text if not ch.isspace()]
        n_chars = max(1, len(chars))  # avoid dividing by zero on empty text

        punct_ratio = sum(ch in PUNCT_STRONG for ch in chars) / n_chars

        casual_punct_count = sum(ch in CASUAL_PUNCT for ch in chars)
        formal_punct_count = sum(ch in FORMAL_PUNCT for ch in chars)
        casual_punct_ratio = casual_punct_count / n_chars
        formal_punct_ratio = formal_punct_count / n_chars

        # Distinct punctuation characters that actually occur in the text.
        formal_types = {ch for ch in chars if ch in FORMAL_PUNCT}
        casual_types = {ch for ch in chars if ch in CASUAL_PUNCT}

        # Count each parenthesis once: ASCII "(" ")" plus the full-width
        # forms U+FF08 / U+FF09 (written as escapes so they cannot be
        # silently normalized back to ASCII and double-counted).
        paren_count = sum(text.count(ch) for ch in "()\uff08\uff09")
        quote_count = sum(text.count(ch) for ch in "「」\"'")

        # --- per-token values and embeddings ---
        vals = []   # field values of tokens that take part in the score
        used = []   # (token, value, flag) for every token known to the engine
        vecs = []   # embeddings of every known token, including ignored ones

        for tok in tokens:
            idx = self.engine.token_to_id.get(tok)
            if idx is None:
                # Token unknown to the engine: no embedding, no field value.
                continue

            vecs.append(self.engine.emb[idx])
            val = float(self.field[idx])

            if is_valid_style_token(tok):
                vals.append(val)
                used.append((tok, val, "used"))
            else:
                used.append((tok, val, "ignored"))

        # Mean dot product between embeddings of consecutive known tokens.
        sims = [float(np.dot(a, b)) for a, b in zip(vecs, vecs[1:])]
        continuity = float(np.mean(sims)) if sims else 0.0

        if not vals:
            # Nothing to score. Return the full key set (zeros for the
            # token-derived metrics) so callers never hit a KeyError.
            return {
                "score": 0.0,
                "raw": 0.0,
                "mean": 0.0,
                "var": 0.0,
                "peak": 0.0,
                "len_var": 0.0,
                "continuity": round(continuity, 4),
                "punct_ratio": round(punct_ratio, 4),
                "formal_punct_ratio": round(formal_punct_ratio, 4),
                "label": "unknown",
                "tokens": [],
            }

        vals = np.array(vals, dtype=np.float32)
        mean = float(np.mean(vals))
        var = float(np.var(vals))
        peak = float(np.max(vals) - mean)

        # Variance of the character length of the scored tokens. ``vals`` is
        # non-empty here, so at least one entry is flagged "used".
        lengths = np.array(
            [len(tok) for tok, _val, flag in used if flag == "used"],
            dtype=np.float32,
        )
        len_var = float(np.var(lengths))

        raw_score = (
            mean
            + 0.30 * var
            + 0.20 * peak
            + 0.10 * len_var
            - 4.0 * continuity
        )

        # Extra penalty when length variance, value variance and continuity
        # are all above their thresholds at once.
        if len_var > 3.0 and var > 2.0 and continuity > 0.12:
            raw_score -= 1.2

        # Formal punctuation, parentheses and quotes lower the score.
        raw_score -= 10.0 * formal_punct_ratio
        raw_score -= 0.25 * paren_count
        raw_score -= 0.20 * quote_count

        # Casual punctuation raises the score, capped at +0.8.
        if casual_punct_ratio > 0:
            raw_score += min(0.8, casual_punct_ratio * 3.0)

        # Bonus when a single kind of casual mark occurs at least twice.
        if len(casual_types) == 1 and casual_punct_count >= 2:
            raw_score += 0.7

        # Penalty when three or more distinct formal marks occur.
        if len(formal_types) >= 3:
            raw_score -= 0.8

        # Small bonus when at most two formal marks occur and they are only
        # "," or "。" (this also holds when there are none).
        if formal_punct_count <= 2 and formal_types.issubset({",", "。"}):
            raw_score += 0.3

        # Map the raw score linearly and clamp to [0, 100]; float bounds keep
        # the result a float even when it is clamped.
        score_0_100 = (raw_score - 3.0) * 12 + 55
        score_0_100 = max(0.0, min(100.0, score_0_100))

        if score_0_100 >= 75:
            label = "human_like"
        elif score_0_100 >= 60:
            label = "maybe_human_like"
        else:
            label = "ai_slop_like"

        return {
            "score": round(score_0_100, 2),
            "raw": round(raw_score, 4),
            "mean": round(mean, 4),
            "var": round(var, 4),
            "peak": round(peak, 4),
            "len_var": round(len_var, 4),
            "continuity": round(continuity, 4),
            "punct_ratio": round(punct_ratio, 4),
            "formal_punct_ratio": round(formal_punct_ratio, 4),
            "label": label,
            "tokens": used,
        }
|
|
|
|
|
def main() -> None:
    """Run an interactive loop that scores each line typed by the user.

    The loop ends when the user enters ``exit`` or ``quit`` (any case), or
    when input is closed or interrupted (EOF / Ctrl-C).
    """
    scorer = PTTScorer()

    # Metrics printed after the score and label, in this order. A metric is
    # skipped when the result dict does not contain it, so a result without
    # the detailed metrics does not raise KeyError.
    metric_keys = (
        "mean",
        "var",
        "peak",
        "len_var",
        "continuity",
        "punct_ratio",
        "formal_punct_ratio",
    )

    while True:
        try:
            text = input("\n請輸入文字:").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed or interrupted input ends the session without a traceback.
            print()
            break

        if text.lower() in {"exit", "quit"}:
            break

        out = scorer.score(text)

        print("\nhuman score:", out["score"])
        print("label:", out["label"])

        for key in metric_keys:
            if key in out:
                print(f"{key}:", out[key])

        print("\nTokens:")
        for item in out["tokens"]:
            if len(item) == 3:
                tok, val, flag = item
                print(f"{val:.3f} | {flag:7} | {tok}")
            else:
                # Two-element entries carry no flag.
                tok, val = item
                print(f"{val:.3f} | {tok}")


if __name__ == "__main__":
    main()