Piyizhixing / app /normalization.py
frdywud's picture
Upload 182 files
dbb49bb verified
from __future__ import annotations
from typing import List, Dict
import os
import json
from .config import settings
# Built-in synonym table: maps a variant term (key) to the canonical term
# (value). normalize_terms() consults it as the fallback when the externally
# loaded synonym table has no entry for a term.
# NOTE(review): the string literals below look mis-encoded (mojibake, possibly
# UTF-8 bytes decoded with the wrong code page) — confirm against the intended
# UTF-8 source. They are runtime data and are deliberately left untouched here.
SYNONYM_MAP = {
    # Symptom synonyms
    "่ƒƒ่„˜่ƒ€": "่ƒ€ๆปก",
    "่ƒƒ่ƒ€": "่ƒ€ๆปก",
    "่„˜้—ท": "่ƒ€ๆปก",
    "ๅฟƒ็ช่ƒ€": "่ƒ€ๆปก",
    "ๅ—ณ่…": "ๅ—ณๆฐ”",
    "็ƒงๅฟƒ": "ๅ้…ธ",
    # Tongue and pulse findings
    "่ˆŒ่‹”ๅŽš่…ป": "่ˆŒ่‹”็™ฝ่…ป",
    "่‹”็™ฝ่…ป": "่ˆŒ่‹”็™ฝ่…ป",
    # Syndrome-type / pathogenesis keywords
    "่‚้ƒ": "่‚ๆฐ”้ƒ็ป“",
    "ๆฐ”ๆœบไธ็•…": "ๆฐ”ๆปž",
    "้ฃŸๆปž": "้ฅฎ้ฃŸ็งฏๆปž",
    "็—ฐๆนฟ": "็—ฐๆนฟไธญ้˜ป",
    "่„พ่™š": "่„พ่ƒƒ่™šๅผฑ",
    "้˜ณ่™š": "่„พ่ƒƒ่™šๅฏ’",
}
_external_map: Dict[str, str] | None = None
def _load_external_synonyms() -> Dict[str, str]:
global _external_map
if _external_map is not None:
return _external_map
path = settings.synonyms_path
mapping: Dict[str, str] = {}
if os.path.isfile(path):
try:
text = open(path, "r", encoding="utf-8").read()
# ๅ…่ฎธ YAML ๆˆ– JSON๏ผ›ๆ—  PyYAML ๆ—ถๅš่ฝป้‡็บง่งฃๆž
if path.endswith(".json") or text.strip().startswith("{"):
mapping = json.loads(text)
else:
# ็ฎ€ๆ˜“ YAML: ๆ”ฏๆŒ "a: b" ๆฏ่กŒไธ€ๆก๏ผ›ๅฟฝ็•ฅๆณจ้‡ŠไธŽ็ฉบ่กŒ
for line in text.splitlines():
line = line.strip()
if (not line) or line.startswith("#"):
continue
if ":" in line:
k, v = line.split(":", 1)
k = k.strip().strip('"\'')
v = v.strip().strip('"\'')
if k and v:
mapping[k] = v
except Exception:
mapping = {}
_external_map = mapping
return mapping
def normalize_terms(terms: List[str]) -> List[str]:
    """Map each term to its canonical form and drop duplicates.

    Each non-empty term is stripped of surrounding whitespace and looked up
    first in the external synonym table, then in the built-in
    ``SYNONYM_MAP``; a term found in neither is kept as-is (stripped).

    Args:
        terms: Raw terms. Falsy entries (``None``, ``""``) and entries that
            are only whitespace are skipped.

    Returns:
        Canonical terms in order of first appearance, without duplicates.
    """
    external = _load_external_synonyms()
    normalized: List[str] = []
    seen = set()  # O(1) duplicate check; ``normalized`` keeps the order
    for term in terms:
        if not term:
            continue
        cleaned = term.strip()
        if not cleaned:
            # A whitespace-only term must not become an empty "" result.
            continue
        # The external table takes precedence over the built-in one.
        canonical = external.get(cleaned, SYNONYM_MAP.get(cleaned, cleaned))
        if canonical not in seen:
            seen.add(canonical)
            normalized.append(canonical)
    return normalized