# axis_core/normalizer.py from __future__ import annotations import re from typing import Any, Dict, List, Tuple # --- 正規化(DB保存前) ------------------------------------------------- _PROPERISH_RE = re.compile(r"\b([A-Z][a-z]+(?:[-'][A-Za-z]+)*)\b") # Hawking, Hayden-Preskill など _NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b") _WS_RE = re.compile(r"\s+") # よく混入して壊す固有名詞系(必要なら追加) _PROPER_STOP = { "Hawking", "Hayden", "Preskill", "Page", "Python", "Lunch", "No", "Cloning", "Equivalence", "Principle", } def _generalize_text(s: str) -> str: if not s: return s x = s.strip() # 数字の一般化 x = _NUMBER_RE.sub("", x) # いかにも固有名詞っぽい英単語を軽く一般化(全消しはしない) def repl(m: re.Match) -> str: w = m.group(1) if w in _PROPER_STOP: return "" # 先頭大文字が続く語を全部消すと情報が落ちすぎるので、長いものだけ置換 if len(w) >= 8: return "" return w x = _PROPERISH_RE.sub(repl, x) x = _WS_RE.sub(" ", x) return x def normalize_template_item(item: Dict[str, Any]) -> Dict[str, Any]: """ DB保存前に「固有問題の残骸」を薄め、汎用テンプレとして再利用可能にする。 - mined_template の schema bullets を正規化 - trigger / tags は残す(ただし過剰に長い trigger は切る) """ t = dict(item) if isinstance(t.get("trigger"), list): t["trigger"] = [str(x)[:40] for x in t["trigger"][:24]] schema = t.get("schema") if isinstance(schema, dict): ns = dict(schema) for k in ("construct", "impossible", "contradiction", "notes"): if isinstance(ns.get(k), list): ns[k] = [_generalize_text(str(b))[:240] for b in ns[k][:12]] t["schema"] = ns # metaも短縮 meta = t.get("meta") if isinstance(meta, dict): nm = dict(meta) if "q_snip" in nm: nm["q_snip"] = _generalize_text(str(nm["q_snip"]))[:120] t["meta"] = nm return t