kofdai
/

verantyx

Model card Files Files and versions

verantyx / axis /axis_core /normalizer.py

kofdai's picture

Upload folder using huggingface_hub

6d07351 verified 21 days ago

history blame contribute delete

2.19 kB

	# axis_core/normalizer.py
	from __future__ import annotations

	import re
	from typing import Any, Dict, List, Tuple

	# --- Normalization (applied before saving to the DB) ---------------------

	# Any run of whitespace characters.
	_WS_RE = re.compile(r"\s+")
	# Unsigned integer or decimal number delimited by word boundaries.
	_NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
	# Capitalized ASCII word, optionally continued by hyphen/apostrophe-joined
	# parts, e.g. "Hawking" or "Hayden-Preskill".
	_PROPERISH_RE = re.compile(r"\b([A-Z][a-z]+(?:[-'][A-Za-z]+)*)\b")

	# Proper-noun-like words that often leak in and break things
	# (extend as needed).
	_PROPER_STOP = {
	    "Hawking",
	    "Hayden",
	    "Preskill",
	    "Page",
	    "Python",
	    "Lunch",
	    "No",
	    "Cloning",
	    "Equivalence",
	    "Principle",
	}

	def _generalize_text(s: str) -> str:
	    """Replace numbers and proper-noun-like words in *s* with placeholders.

	    Steps, in order:

	    1. Strip leading and trailing whitespace.
	    2. Replace every ``_NUMBER_RE`` match with ``<n>``.
	    3. Replace a ``_PROPERISH_RE`` match with ``<NAME>`` when any of its
	       hyphen/apostrophe-separated parts is in ``_PROPER_STOP``, or when
	       the whole match is at least 8 characters long. Shorter capitalized
	       words are kept as they are.
	    4. Collapse every run of whitespace into a single space.

	    Falsy input (e.g. ``""``) is returned unchanged.
	    """
	    if not s:
	        return s

	    def _mask(m: re.Match) -> str:
	        word = m.group(1)
	        # The pattern captures compounds such as "Page's" or
	        # "Hayden-Preskill" as a single token, so compare each component
	        # against the stop list; comparing only the whole token would let
	        # short compounds built on a stop-listed name slip through.
	        components = word.replace("'", "-").split("-")
	        if any(part in _PROPER_STOP for part in components):
	            return "<NAME>"
	        # Dropping every capitalized word would lose too much information,
	        # so beyond the stop list only long words are replaced.
	        if len(word) >= 8:
	            return "<NAME>"
	        return word

	    text = s.strip()
	    # Generalize numbers.
	    text = _NUMBER_RE.sub("<n>", text)
	    # Lightly generalize English words that look like proper nouns.
	    text = _PROPERISH_RE.sub(_mask, text)
	    return _WS_RE.sub(" ", text)

	def normalize_template_item(item: Dict[str, Any]) -> Dict[str, Any]:
	    """Return a copy of *item* generalized for storage as a reusable template.

	    Applied before saving to the DB to dilute leftovers of the specific
	    problem the template was mined from.

	    - ``trigger`` (when a list): only the first 24 entries are kept, each
	      converted to ``str`` and cut to 40 characters.
	    - ``schema`` (when a dict): for the keys ``construct``, ``impossible``,
	      ``contradiction`` and ``notes`` that hold a list, only the first 12
	      bullets are kept; each is converted to ``str``, passed through
	      ``_generalize_text`` and cut to 240 characters.
	    - ``meta`` (when a dict): ``q_snip``, if present, is converted to
	      ``str``, passed through ``_generalize_text`` and cut to 120
	      characters.

	    All other keys are carried over as they are. The top-level dict and
	    the ``schema`` / ``meta`` dicts are copied before being changed, so
	    the corresponding dicts of *item* are not modified.
	    """
	    result = dict(item)

	    triggers = result.get("trigger")
	    if isinstance(triggers, list):
	        result["trigger"] = [str(entry)[:40] for entry in triggers[:24]]

	    schema = result.get("schema")
	    if isinstance(schema, dict):
	        cleaned_schema = dict(schema)
	        for field in ("construct", "impossible", "contradiction", "notes"):
	            bullets = cleaned_schema.get(field)
	            if not isinstance(bullets, list):
	                continue
	            cleaned_schema[field] = [
	                _generalize_text(str(bullet))[:240] for bullet in bullets[:12]
	            ]
	        result["schema"] = cleaned_schema

	    # Shorten the meta snippet as well.
	    meta = result.get("meta")
	    if isinstance(meta, dict):
	        cleaned_meta = dict(meta)
	        if "q_snip" in cleaned_meta:
	            snippet = _generalize_text(str(cleaned_meta["q_snip"]))
	            cleaned_meta["q_snip"] = snippet[:120]
	        result["meta"] = cleaned_meta

	    return result