# core/unit_utils.py
# Helpers for detecting the monetary unit declared in a financial document
# (e.g. "単位:百万円" or "in thousands of yen") and for converting extracted
# figures to plain yen.
from __future__ import annotations

import re
from collections import Counter
from typing import Dict, Any, Tuple, List

# Unit label -> multiplier that converts a figure expressed in that unit to yen.
UNIT_SCALE = {
    "円": 1.0,
    "千円": 1e3,
    "万円": 1e4,
    "百万円": 1e6,
    "千万円": 1e7,
    "億円": 1e8,
    "十億円": 1e9,  # 十億 = 10 x 10^8 (one billion)
    # English expressions
    "thousands of yen": 1e3,
    "thousand yen": 1e3,
    "in thousands of yen": 1e3,
    "millions of yen": 1e6,
    "in millions of yen": 1e6,
    "jpy in thousands": 1e3,
    "jpy in millions": 1e6,
}

# Regexes covering the commonly seen unit declarations. Each one captures the
# unit word in group 1.
_PATTERNS = [
    # "単位" followed by optional ASCII colon, full-width colon (U+FF1A) or
    # whitespace, then the unit itself.
    r"単位[:\uff1a\s]*?(千円|百万円|万円|円|千万円|億円|十億円)",
    r"(?:Amounts?|Figures?)\s+in\s+(thousands|millions)\s+of\s+yen",
    r"(?:JPY|YEN)\s+in\s+(thousands|millions)",
    r"(?:in\s+)?(thousands|millions)\s+of\s+yen",
]

# Compiled once at import time so detect_unit_scale does not redo the lookup
# on every call.
_COMPILED_PATTERNS = [re.compile(p, flags=re.IGNORECASE) for p in _PATTERNS]

# Japanese unit words that are recorded as hits verbatim.
_JA_UNITS = ("千円", "百万円", "万円", "円", "千万円", "億円", "十億円")

# Captured English word -> hit label recorded for it.
_EN_CAPTURE_TO_HIT = {
    "thousands": "thousands of yen",
    "thousand": "thousands of yen",
    "millions": "millions of yen",
    "million": "millions of yen",
}

# English label -> Japanese label used for the returned `label`.
_EN_LABEL_TO_JA = {
    "thousands of yen": "千円",
    "thousand yen": "千円",
    "in thousands of yen": "千円",
    "jpy in thousands": "千円",
    "millions of yen": "百万円",
    "in millions of yen": "百万円",
    "jpy in millions": "百万円",
}

# Sections of the extracted JSON whose values are monetary amounts.
_SCALED_SECTIONS = ("balance_sheet", "income_statement", "cash_flows")


def detect_unit_scale(text: str) -> Tuple[float, str, List[str]]:
    """
    Guess the monetary unit declared in ``text``.

    Returns ``(scale, label, hits)`` where ``scale`` is the multiplier to yen,
    ``label`` is the Japanese unit label and ``hits`` lists every unit mention
    found (Japanese labels verbatim, English ones as "thousands of yen" /
    "millions of yen").

    When nothing is found, or ``text`` is empty/None, returns
    ``(1.0, '円', [])``. With several mentions the most frequent one wins;
    ties go to the one recorded first.
    """
    if not text:
        return 1.0, "円", []

    hits: List[str] = []
    # Several patterns can match the very same unit word (the generic
    # "... of yen" pattern also matches "Amounts in ... of yen"). Remember the
    # span of each captured word so one mention contributes exactly one vote.
    seen_spans = set()
    for pattern in _COMPILED_PATTERNS:
        for m in pattern.finditer(text):
            span = m.span(1)
            if span in seen_spans:
                continue
            captured = m.group(1).lower()
            if captured in _JA_UNITS:
                hit = captured
            else:
                hit = _EN_CAPTURE_TO_HIT.get(captured)
            if hit is None:
                continue
            seen_spans.add(span)
            hits.append(hit)

    if not hits:
        return 1.0, "円", []

    # Majority vote; Counter keeps first-seen order among equal counts.
    label = Counter(hits).most_common(1)[0][0]
    # Normalise the label, preferring the Japanese spelling.
    label = _EN_LABEL_TO_JA.get(label, label)
    scale = UNIT_SCALE.get(label, 1.0)
    return scale, label, hits


def _scale_number(v: Any, scale: float) -> Any:
    """
    Multiply ``v`` by ``scale`` when it can be read as a number.

    ``None`` stays ``None`` and values that ``float()`` rejects are returned
    unchanged.
    """
    if v is None:
        return None
    try:
        f = float(v)
    except (TypeError, ValueError, OverflowError):
        return v
    return f * scale


def apply_unit_scale(fin: Dict[str, Any], scale: float) -> Dict[str, Any]:
    """
    Apply the unit conversion to the extracted JSON.

    Only the values directly inside ``balance_sheet`` / ``income_statement`` /
    ``cash_flows`` are converted. Ratios may be mixed in with the amounts, so
    when ``scale > 1`` any value within [-1, 1] is left unscaled to avoid
    converting it by mistake.

    ``fin`` is not modified: the top-level dict and each dict-valued entry are
    shallow-copied. A section missing from ``fin`` appears as an empty dict in
    the result; a section that is not a dict is passed through as is. Values
    that can be read as numbers come back as floats.
    """

    def maybe_scale(x):
        if x is None:
            return None
        try:
            f = float(x)
        except (TypeError, ValueError, OverflowError):
            return x
        # Small values are typically ratios or margins: skip them.
        if scale > 1 and -1.0 <= f <= 1.0:
            return f
        return f * scale

    out = {k: (v.copy() if isinstance(v, dict) else v) for k, v in fin.items()}
    for sec in _SCALED_SECTIONS:
        section = out.get(sec, {})
        if isinstance(section, dict):
            # Only existing keys are reassigned, so the dict size is stable
            # while iterating.
            for key, value in section.items():
                section[key] = maybe_scale(value)
            out[sec] = section
    return out