# verantyx / axis / axis_core / normalizer.py
# kofdai's picture
# Upload folder using huggingface_hub
# 6d07351 verified
# axis_core/normalizer.py
from __future__ import annotations
import re
from typing import Any, Dict, List, Tuple
# --- Normalization (applied before saving to the DB) ------------------------

# Capitalized English word, optionally extended by hyphen- or apostrophe-joined
# parts, e.g. "Hawking", "Hayden-Preskill", "Page's".
_PROPERISH_RE = re.compile(r"\b([A-Z][a-z]+(?:[-'][A-Za-z]+)*)\b")
# Integer or simple decimal literal, e.g. "42" or "3.14".
_NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
# Any run of whitespace.
_WS_RE = re.compile(r"\s+")
# Separators that join the parts of a token matched by _PROPERISH_RE.
_PART_SEP_RE = re.compile(r"[-']")

# Proper-noun-like words that frequently leak in and break templates
# (extend as needed).
_PROPER_STOP = {
    "Hawking", "Hayden", "Preskill", "Page", "Python", "Lunch",
    "No", "Cloning", "Equivalence", "Principle",
}


def _generalize_text(s: str) -> str:
    """Replace problem-specific numbers and proper-noun-like words with placeholders.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Replace every integer/decimal literal with ``<n>``.
      3. Replace a capitalized ASCII word with ``<NAME>`` when any of its
         hyphen/apostrophe-separated parts is in ``_PROPER_STOP`` or when the
         whole token is at least 8 characters long. Other capitalized words
         are kept so that not too much information is lost.
      4. Collapse every whitespace run into a single space.

    Args:
        s: Text to generalize. Falsy values (``""``, ``None``) are returned
            unchanged.

    Returns:
        The generalized text.
    """
    if not s:
        return s

    def repl(m: re.Match) -> str:
        w = m.group(1)
        # Check each component, not just the whole token: otherwise short
        # possessives/compounds such as "Page's" or "Lunch's" slip past both
        # the stop list and the length rule below.
        if any(part in _PROPER_STOP for part in _PART_SEP_RE.split(w)):
            return "<NAME>"
        # Dropping every capitalized word would lose too much information,
        # so only long tokens are replaced.
        if len(w) >= 8:
            return "<NAME>"
        return w

    # Generalize numbers first, then proper-noun-like words.
    x = _NUMBER_RE.sub("<n>", s.strip())
    x = _PROPERISH_RE.sub(repl, x)
    return _WS_RE.sub(" ", x)
def normalize_template_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Return a trimmed, generalized copy of a template item for DB storage.

    The goal is to dilute leftovers of the specific source problem so the
    item can be reused as a generic template.

    - ``trigger`` (if a list): keep at most 24 entries, each coerced to
      ``str`` and cut to 40 characters. Tags and other keys are kept as-is.
    - ``schema`` (if a dict): for the list-valued sections ``construct``,
      ``impossible``, ``contradiction`` and ``notes``, keep at most 12
      bullets, each generalized and cut to 240 characters.
    - ``meta`` (if a dict): ``q_snip``, when present, is generalized and cut
      to 120 characters.

    The input mapping and its nested ``schema`` / ``meta`` dicts are not
    modified; the result holds fresh copies of them.
    """
    max_triggers, max_trigger_len = 24, 40
    max_bullets, max_bullet_len = 12, 240
    max_snip_len = 120

    result = dict(item)

    triggers = result.get("trigger")
    if isinstance(triggers, list):
        result["trigger"] = [
            str(entry)[:max_trigger_len] for entry in triggers[:max_triggers]
        ]

    schema = result.get("schema")
    if isinstance(schema, dict):
        schema_copy = dict(schema)
        for section in ("construct", "impossible", "contradiction", "notes"):
            bullets = schema_copy.get(section)
            if not isinstance(bullets, list):
                continue
            schema_copy[section] = [
                _generalize_text(str(bullet))[:max_bullet_len]
                for bullet in bullets[:max_bullets]
            ]
        result["schema"] = schema_copy

    # Shorten the metadata snippet as well.
    meta = result.get("meta")
    if isinstance(meta, dict):
        meta_copy = dict(meta)
        if "q_snip" in meta_copy:
            snippet = str(meta_copy["q_snip"])
            meta_copy["q_snip"] = _generalize_text(snippet)[:max_snip_len]
        result["meta"] = meta_copy

    return result