File size: 3,215 Bytes
28ab919 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# core/unit_utils.py
from __future__ import annotations
import re
from typing import Dict, Any, Tuple, List
# Unit label -> multiplier that converts an amount in that unit into plain yen.
# Keys are looked up verbatim, so the English keys are kept in lower case.
UNIT_SCALE = {
    "円": 1.0,
    "千円": 1e3,
    "万円": 1e4,
    "百万円": 1e6,
    "千万円": 1e7,
    "億円": 1e8,
    # 十億 = 10 x 億 (1e8) = 1e9 (one billion yen), not 1e10.
    "十億円": 1e9,
    # English expressions
    "thousands of yen": 1e3,
    "thousand yen": 1e3,
    "in thousands of yen": 1e3,
    "millions of yen": 1e6,
    "in millions of yen": 1e6,
    "jpy in thousands": 1e3,
    "jpy in millions": 1e6,
}
# Regexes covering the commonly seen unit declarations.
# In every pattern, group 1 captures the unit word itself.
_PATTERNS = [
    # Japanese "単位" label, optionally followed by an ASCII colon, a full-width
    # colon (U+FF1A, written as an escape so it survives text normalisation)
    # or whitespace, then the unit.
    r"単位[:\uff1a\s]*?(千円|百万円|万円|円|千万円|億円|十億円)",
    # "Amounts in thousands of yen" / "Figures in millions of yen"
    r"(?:Amounts?|Figures?)\s+in\s+(thousands|millions)\s+of\s+yen",
    # "JPY in thousands" / "YEN in millions"
    r"(?:JPY|YEN)\s+in\s+(thousands|millions)",
    # Bare "(in) thousands of yen" / "(in) millions of yen"
    r"(?:in\s+)?(thousands|millions)\s+of\s+yen",
]
def detect_unit_scale(text: str) -> Tuple[float, str, List[str]]:
    """
    Guess the monetary unit declared in ``text``.

    Every regex in ``_PATTERNS`` is searched case-insensitively and the unit
    word captured by group 1 is collected as a hit. The most frequent hit
    wins; on a tie the hit that was collected first wins (patterns are tried
    in list order, matches in text order).

    Args:
        text: Free text to scan. ``None`` or an empty string yields the default.

    Returns:
        A ``(scale, label, hits)`` tuple: the multiplier to plain yen taken
        from ``UNIT_SCALE``, the winning label (English hits are normalised
        to their Japanese equivalent), and the list of collected hits.
        When nothing is found the result is ``(1.0, "円", [])``.
    """
    from collections import Counter

    if not text:
        return 1.0, "円", []

    japanese_units = ("千円", "百万円", "万円", "円", "千万円", "億円", "十億円")
    english_units = {
        "thousands": "thousands of yen",
        "thousand": "thousands of yen",
        "millions": "millions of yen",
        "million": "millions of yen",
    }

    hits: List[str] = []
    # Several patterns can match the very same phrase (for example
    # "Amounts in thousands of yen" satisfies both the "Amounts in ..." and
    # the bare "... of yen" pattern). Remember the span of each captured unit
    # word so that one occurrence in the text casts only one vote.
    seen_spans = set()
    for pat in _PATTERNS:
        for m in re.finditer(pat, text, flags=re.IGNORECASE):
            span = m.span(1)
            if span in seen_spans:
                continue
            seen_spans.add(span)
            g = m.group(1).lower()
            if g in japanese_units:
                hits.append(g)
            elif g in english_units:
                hits.append(english_units[g])

    if not hits:
        return 1.0, "円", []

    # Majority vote; Counter.most_common keeps first-seen order among ties.
    label = Counter(hits).most_common(1)[0][0]

    # Normalise the label, preferring the Japanese spelling.
    if label in ("thousands of yen", "thousand yen", "in thousands of yen", "jpy in thousands"):
        label = "千円"
    elif label in ("millions of yen", "in millions of yen", "jpy in millions"):
        label = "百万円"

    scale = UNIT_SCALE.get(label, 1.0)
    return scale, label, hits
def _scale_number(v: Any, scale: float) -> Any:
    """
    Multiply ``v`` by ``scale`` when ``v`` can be converted with ``float()``.

    ``None`` is passed through as ``None``; any value that ``float()`` rejects
    is returned unchanged. Convertible values (including numeric strings) come
    back as a ``float``.
    """
    if v is None:
        return None
    try:
        number = float(v)
    except Exception:
        # Not numeric: hand the value back untouched.
        result = v
    else:
        result = number * scale
    return result
def apply_unit_scale(fin: Dict[str, Any], scale: float) -> Dict[str, Any]:
    """
    Apply the unit conversion to the values of the extracted JSON sections
    (balance_sheet / income_statement / cash_flows).

    Ratios may be mixed in with the amounts, so when ``scale`` is greater
    than 1 any value in the narrow range -1..1 is left unscaled to avoid a
    wrong conversion.

    The input mapping is not modified: the top level and each dict value are
    shallow-copied first. A section missing from ``fin`` appears in the
    result as an empty dict; a section whose value is not a dict is kept
    as it is. Values that ``float()`` cannot convert are kept unchanged.
    """
    section_names = ("balance_sheet", "income_statement", "cash_flows")

    def convert(raw):
        if raw is None:
            return None
        try:
            number = float(raw)
        except Exception:
            return raw
        # Small values typically look like ratios or margins: skip them.
        looks_like_ratio = -1.0 <= number <= 1.0
        if looks_like_ratio and scale > 1:
            return number
        return number * scale

    result: Dict[str, Any] = {}
    for key, value in fin.items():
        result[key] = value.copy() if isinstance(value, dict) else value

    for name in section_names:
        # setdefault registers an empty section when the key is absent.
        section = result.setdefault(name, {})
        if not isinstance(section, dict):
            continue
        for field in list(section):
            section[field] = convert(section[field])
    return result
|