|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import re |
|
|
from typing import Any, Dict, List, Tuple |
|
|
|
|
|
|
|
|
|
|
|
_PROPERISH_RE = re.compile(r"\b([A-Z][a-z]+(?:[-'][A-Za-z]+)*)\b") |
|
|
_NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b") |
|
|
_WS_RE = re.compile(r"\s+") |
|
|
|
|
|
|
|
|
_PROPER_STOP = { |
|
|
"Hawking", "Hayden", "Preskill", "Page", "Python", "Lunch", |
|
|
"No", "Cloning", "Equivalence", "Principle", |
|
|
} |
|
|
|
|
|
def _generalize_text(s: str) -> str: |
|
|
if not s: |
|
|
return s |
|
|
x = s.strip() |
|
|
|
|
|
|
|
|
x = _NUMBER_RE.sub("<n>", x) |
|
|
|
|
|
|
|
|
def repl(m: re.Match) -> str: |
|
|
w = m.group(1) |
|
|
if w in _PROPER_STOP: |
|
|
return "<NAME>" |
|
|
|
|
|
if len(w) >= 8: |
|
|
return "<NAME>" |
|
|
return w |
|
|
|
|
|
x = _PROPERISH_RE.sub(repl, x) |
|
|
x = _WS_RE.sub(" ", x) |
|
|
return x |
|
|
|
|
|
def normalize_template_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *item* with problem-specific residue toned down.

    Intended to run before the item is saved to the DB, so that it can be
    reused as a generic template:

    - ``trigger`` (when a list) is limited to its first 24 entries, each
      converted to ``str`` and cut to 40 characters.
    - ``schema`` (when a dict): for the keys ``construct``, ``impossible``,
      ``contradiction`` and ``notes``, list values are limited to their first
      12 bullets; each bullet is stringified, passed through
      ``_generalize_text`` and cut to 240 characters.
    - ``meta`` (when a dict): ``q_snip``, if present, is stringified, passed
      through ``_generalize_text`` and cut to 120 characters.

    Every other key (e.g. tags) is carried over untouched.

    Args:
        item: The template item to normalize.

    Returns:
        A new dict. The top-level dict and the ``schema`` / ``meta`` dicts are
        shallow-copied before being modified, so *item* itself is not mutated.
    """
    t = dict(item)

    if isinstance(t.get("trigger"), list):
        t["trigger"] = [str(x)[:40] for x in t["trigger"][:24]]

    schema = t.get("schema")
    if isinstance(schema, dict):
        ns = dict(schema)
        for k in ("construct", "impossible", "contradiction", "notes"):
            if isinstance(ns.get(k), list):
                ns[k] = [_generalize_text(str(b))[:240] for b in ns[k][:12]]
        t["schema"] = ns

    meta = t.get("meta")
    if isinstance(meta, dict):
        nm = dict(meta)
        if "q_snip" in nm:
            nm["q_snip"] = _generalize_text(str(nm["q_snip"]))[:120]
        t["meta"] = nm

    return t