# axis_core/normalizer.py
from __future__ import annotations

import re
from typing import Any, Dict, List, Tuple

# --- Normalization (applied before saving to the DB) --------------------

# Capitalized word, optionally followed by hyphen- or apostrophe-joined
# parts, e.g. Hawking, Hayden-Preskill.
_PROPERISH_RE = re.compile(r"\b([A-Z][a-z]+(?:[-'][A-Za-z]+)*)\b")  # e.g. Hawking, Hayden-Preskill
# Integer or simple decimal number delimited by word boundaries.
_NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
# Any run of whitespace characters.
_WS_RE = re.compile(r"\s+")

# Proper-noun-like words that often leak in and break things
# (add more entries as needed).
_PROPER_STOP = {
    "Hawking", "Hayden", "Preskill", "Page", "Python", "Lunch",
    "No", "Cloning", "Equivalence", "Principle",
}

def _generalize_text(s: str) -> str:
    """Replace problem-specific numbers and proper-noun-like words with placeholders.

    Processing order:

    1. Strip leading/trailing whitespace.
    2. Replace every ``_NUMBER_RE`` match with ``<n>``.
    3. Replace capitalized tokens matched by ``_PROPERISH_RE`` with ``<NAME>``
       when the token (or any hyphen/apostrophe-separated component of it)
       is listed in ``_PROPER_STOP``, or when the token is at least
       8 characters long. Other capitalized tokens are kept.
    4. Collapse every whitespace run into a single space.

    Args:
        s: Text to generalize.

    Returns:
        The generalized text. A falsy ``s`` (e.g. ``""``) is returned as is.
    """
    if not s:
        return s

    def _placeholder(match: re.Match) -> str:
        token = match.group(1)
        # The pattern captures compounds such as "Hayden-Preskill" or
        # "Page's" as one token, so the stop list must also be checked per
        # component; otherwise short compounds of listed names slip through.
        if token in _PROPER_STOP or any(
            part in _PROPER_STOP for part in re.split(r"[-']", token)
        ):
            return "<NAME>"
        # Replacing every capitalized word would drop too much information,
        # so only long tokens are treated as names.
        if len(token) >= 8:
            return "<NAME>"
        return token

    # Generalize numbers first, then name-like tokens, then tidy whitespace.
    text = _NUMBER_RE.sub("<n>", s.strip())
    text = _PROPERISH_RE.sub(_placeholder, text)
    return _WS_RE.sub(" ", text)

def normalize_template_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a copy of ``item`` with problem-specific residue toned down so the
    template can be reused generically; meant to run before saving to the DB.

    - ``trigger`` (if a list): at most 24 entries, each converted to ``str``
      and cut to 40 characters.
    - ``schema`` (if a dict): the list-valued keys ``construct``,
      ``impossible``, ``contradiction`` and ``notes`` keep at most 12
      bullets, each passed through ``_generalize_text`` and cut to
      240 characters.
    - ``meta`` (if a dict): ``q_snip``, when present, is passed through
      ``_generalize_text`` and cut to 120 characters.

    Every other key is carried over unchanged. The top-level dict and the
    ``schema`` / ``meta`` dicts are shallow-copied before being modified.
    """
    result = dict(item)

    # Cap both the number of triggers and the length of each one.
    triggers = result.get("trigger")
    if isinstance(triggers, list):
        result["trigger"] = [str(entry)[:40] for entry in triggers[:24]]

    # Generalize the bullet lists of the schema.
    schema = result.get("schema")
    if isinstance(schema, dict):
        cleaned = dict(schema)
        for field in ("construct", "impossible", "contradiction", "notes"):
            bullets = cleaned.get(field)
            if not isinstance(bullets, list):
                continue
            cleaned[field] = [
                _generalize_text(str(bullet))[:240] for bullet in bullets[:12]
            ]
        result["schema"] = cleaned

    # Shorten the question snippet kept in the metadata as well.
    meta = result.get("meta")
    if isinstance(meta, dict):
        trimmed = dict(meta)
        if "q_snip" in trimmed:
            trimmed["q_snip"] = _generalize_text(str(trimmed["q_snip"]))[:120]
        result["meta"] = trimmed

    return result