File size: 3,215 Bytes
28ab919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# core/unit_utils.py
from __future__ import annotations
import re
from typing import Dict, Any, Tuple, List

# Unit label -> multiplier that converts an amount stated in that unit to plain yen.
UNIT_SCALE = {
    "円": 1.0,
    "千円": 1e3,
    "万円": 1e4,
    "百万円": 1e6,
    "千万円": 1e7,
    "億円": 1e8,
    # 十億 is ten times 億 (1e8), i.e. one billion; it was previously 1e10,
    # which inflated every amount in this unit by a factor of ten.
    "十億円": 1e9,
    # English phrasings
    "thousands of yen": 1e3,
    "thousand yen": 1e3,
    "in thousands of yen": 1e3,
    "millions of yen": 1e6,
    "in millions of yen": 1e6,
    "jpy in thousands": 1e3,
    "jpy in millions": 1e6,
}

# Regexes covering the common ways a document declares its monetary unit.
# Every pattern captures the unit word itself in group 1.
_PATTERNS = [
    # Japanese "単位" header. The separator may be an ASCII colon, a
    # full-width colon (written as the \uFF1A escape so it cannot be lost to
    # width normalisation of this file) or any whitespace.
    r"単位[:\uFF1A\s]*?(千円|百万円|万円|円|千万円|億円|十億円)",
    # "Amounts in thousands of yen" / "Figures in millions of yen"
    r"(?:Amounts?|Figures?)\s+in\s+(thousands|millions)\s+of\s+yen",
    # "JPY in thousands" / "YEN in millions"
    r"(?:JPY|YEN)\s+in\s+(thousands|millions)",
    # Bare "(in) thousands of yen" / "(in) millions of yen"
    r"(?:in\s+)?(thousands|millions)\s+of\s+yen",
]

def detect_unit_scale(text: str) -> Tuple[float, str, List[str]]:
    """
    Guess the monetary unit declared in ``text``.

    Every regex in ``_PATTERNS`` is run case-insensitively over ``text`` and
    each captured unit word is recorded as a hit: Japanese units as-is,
    English "thousands"/"millions" as "thousands of yen"/"millions of yen".
    The most frequent hit wins; on a tie the unit that was recorded first
    wins (patterns are tried in list order, matches in text order).

    Returns:
        ``(scale, label, hits)`` where ``label`` is the winning unit
        normalised to its Japanese form, ``scale`` is its ``UNIT_SCALE``
        multiplier (1.0 if the label is unknown) and ``hits`` is the list of
        recorded hits. When ``text`` is empty/``None`` or nothing matches,
        the result is ``(1.0, "円", [])``.
    """
    # Local import: the module does not import collections at file level.
    from collections import Counter

    if not text:
        return 1.0, "円", []

    japanese_units = ("千円", "百万円", "万円", "円", "千万円", "億円", "十億円")
    hits: List[str] = []
    # The English patterns overlap (e.g. "Amounts in millions of yen" is also
    # matched by the bare "in millions of yen" pattern). Remember which unit
    # word positions were already counted so one mention casts one vote.
    counted_spans = set()
    for pat in _PATTERNS:
        for m in re.finditer(pat, text, flags=re.IGNORECASE):
            span = m.span(1)
            if span in counted_spans:
                continue
            g = m.group(1).lower()
            if g in japanese_units:
                hit = g
            elif g in ("thousands", "thousand"):
                hit = "thousands of yen"
            elif g in ("millions", "million"):
                hit = "millions of yen"
            else:
                continue
            counted_spans.add(span)
            hits.append(hit)

    if not hits:
        return 1.0, "円", []

    # Majority vote; Counter keeps first-seen order among equal counts.
    label = Counter(hits).most_common(1)[0][0]

    # Normalise the label to its Japanese form.
    if label in ("thousands of yen", "thousand yen", "in thousands of yen", "jpy in thousands"):
        label = "千円"
    if label in ("millions of yen", "in millions of yen", "jpy in millions"):
        label = "百万円"

    scale = UNIT_SCALE.get(label, 1.0)
    return scale, label, hits

def _scale_number(v: Any, scale: float) -> Any:
    """Return ``v`` multiplied by ``scale`` when ``v`` converts to a float.

    ``None`` yields ``None``; a value that ``float()`` rejects is handed back
    unchanged.
    """
    if v is not None:
        try:
            as_float = float(v)
        except Exception:
            # Not numeric: pass the original object through untouched.
            return v
        return as_float * scale
    return None

def apply_unit_scale(fin: Dict[str, Any], scale: float) -> Dict[str, Any]:
    """
    Apply a unit conversion to the figures of an extracted-financials dict.

    Only the ``balance_sheet``, ``income_statement`` and ``cash_flows``
    sections are converted. Ratios may be mixed in with the amounts, so when
    ``scale`` is greater than 1 any value within [-1, 1] is left unscaled to
    avoid converting it by mistake.

    Returns a new dict: top-level dict values are shallow-copied, so the
    sections of ``fin`` are not modified. A section missing from ``fin``
    appears in the result as an empty dict; a section that is not a dict is
    left as it is.
    """
    def convert(raw):
        if raw is None:
            return None
        try:
            number = float(raw)
        except Exception:
            # Non-numeric entries are passed through unchanged.
            return raw
        # Small magnitudes look like ratios or margins rather than amounts.
        if abs(number) <= 1.0 and scale > 1:
            return number
        return number * scale

    result = {}
    for key, value in fin.items():
        result[key] = value.copy() if isinstance(value, dict) else value

    for section in ("balance_sheet", "income_statement", "cash_flows"):
        table = result.get(section, {})
        if not isinstance(table, dict):
            continue
        table.update({name: convert(entry) for name, entry in table.items()})
        result[section] = table
    return result