Corin1998 committed on
Commit
28ab919
·
verified ·
1 Parent(s): c83ead4

Create unit_utils.py

Browse files
Files changed (1) hide show
  1. core/unit_utils.py +101 -0
core/unit_utils.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/unit_utils.py
2
+ from __future__ import annotations
3
+ import re
4
+ from typing import Dict, Any, Tuple, List
5
+
6
# Unit label -> multiplier that converts an amount quoted in that unit
# into plain yen (yen is the base unit).
UNIT_SCALE = {
    "円": 1.0,
    "千円": 1e3,
    "万円": 1e4,
    "百万円": 1e6,
    "千万円": 1e7,
    "億円": 1e8,
    # 十億 is ten times 億 (1e8), i.e. one billion yen.
    "十億円": 1e9,
    # English expressions
    "thousands of yen": 1e3,
    "thousand yen": 1e3,
    "in thousands of yen": 1e3,
    "millions of yen": 1e6,
    "in millions of yen": 1e6,
    "jpy in thousands": 1e3,
    "jpy in millions": 1e6,
}
24
+
25
# Regular expressions covering the commonly used ways of declaring the
# monetary unit. In every pattern, group 1 captures the unit token.
_PATTERNS = [
    # Japanese "単位" declaration. The separator may be an ASCII colon, a
    # full-width colon (U+FF1A, written as an escape so it survives text
    # normalisation) or whitespace.
    r"単位[:\uFF1A\s]*?(千円|百万円|万円|円|千万円|億円|十億円)",
    # "Amounts in thousands of yen" / "Figures in millions of yen"
    r"(?:Amounts?|Figures?)\s+in\s+(thousands|millions)\s+of\s+yen",
    # "JPY in thousands" / "YEN in millions"
    r"(?:JPY|YEN)\s+in\s+(thousands|millions)",
    # "(in) thousands of yen" / "(in) millions of yen"
    r"(?:in\s+)?(thousands|millions)\s+of\s+yen",
]
32
+
33
def detect_unit_scale(text: str) -> Tuple[float, str, List[str]]:
    """Estimate the monetary unit declared in *text*.

    Every regex in ``_PATTERNS`` is searched case-insensitively and each
    distinct unit mention casts one vote. The label with the most votes
    wins; on a tie the label that was recorded first wins (patterns are
    tried in list order, matches of one pattern in text order).

    Args:
        text: Text to scan. Empty or ``None`` is treated as "no unit found".

    Returns:
        A ``(scale, label, hits)`` tuple: ``scale`` is the multiplier looked
        up in ``UNIT_SCALE`` (1.0 if the label is missing there), ``label``
        is the winning unit with English phrases normalised to their
        Japanese label, and ``hits`` lists every vote that was counted.
        When nothing matches the result is ``(1.0, "円", [])``.
    """
    from collections import Counter

    if not text:
        return 1.0, "円", []

    japanese_units = ("千円", "百万円", "万円", "円", "千万円", "億円", "十億円")
    english_votes = {
        "thousands": "thousands of yen",
        "thousand": "thousands of yen",
        "millions": "millions of yen",
        "million": "millions of yen",
    }

    hits: List[str] = []
    # Several patterns can match the very same mention (e.g. "Amounts in
    # millions of yen" also contains "in millions of yen"). Votes are keyed
    # by the start offset of the captured unit token so that one mention
    # is counted exactly once.
    counted_offsets = set()
    for pat in _PATTERNS:
        for m in re.finditer(pat, text, flags=re.IGNORECASE):
            offset = m.start(1)
            if offset in counted_offsets:
                continue
            token = m.group(1).lower()
            vote = token if token in japanese_units else english_votes.get(token)
            if vote is None:
                continue
            counted_offsets.add(offset)
            hits.append(vote)

    if not hits:
        return 1.0, "円", []

    # Majority vote; most_common() keeps first-seen order among equal counts.
    label = Counter(hits).most_common(1)[0][0]

    # Normalise the label, preferring the Japanese spelling.
    if label in ("thousands of yen", "thousand yen", "in thousands of yen", "jpy in thousands"):
        label = "千円"
    elif label in ("millions of yen", "in millions of yen", "jpy in millions"):
        label = "百万円"

    return UNIT_SCALE.get(label, 1.0), label, hits
67
+
68
def _scale_number(v: Any, scale: float) -> Any:
    """Multiply a numeric-looking value by *scale*.

    Args:
        v: Value to convert. Anything accepted by ``float()`` (numbers,
            numeric strings) is scaled.
        scale: Multiplier to apply.

    Returns:
        ``float(v) * scale`` when *v* converts to a float. ``None``,
        booleans and values ``float()`` rejects are returned unchanged.
    """
    # bool is a subclass of int; scaling a flag would turn True into an amount.
    if v is None or isinstance(v, bool):
        return v
    try:
        number = float(v)
    except (TypeError, ValueError, OverflowError):
        # Not a number (or an int too large for float): leave it as it is.
        return v
    return number * scale
76
+
77
def apply_unit_scale(fin: Dict[str, Any], scale: float) -> Dict[str, Any]:
    """Apply the unit conversion to the numbers of an extracted-financials dict.

    Only the direct values of the ``balance_sheet``, ``income_statement``
    and ``cash_flows`` sections are converted. Because ratios or margins
    may be mixed in with amounts, values in the narrow range -1..1 are not
    multiplied when ``scale > 1`` (guard against mis-conversion); they are
    still returned as floats.

    Args:
        fin: Mapping of section name to section content. It is not
            mutated: the top level and every dict value are shallow-copied.
        scale: Multiplier that converts the quoted unit into yen.

    Returns:
        A new dict. Each of the three sections is always present in the
        result (a missing one becomes ``{}``); a section that is not a dict
        is passed through untouched. ``None``, booleans and values that
        ``float()`` rejects (including nested containers) stay as they are.
    """
    def maybe_scale(x: Any) -> Any:
        # bool is a subclass of int; a flag must not be turned into an amount.
        if x is None or isinstance(x, bool):
            return x
        try:
            f = float(x)
        except (TypeError, ValueError, OverflowError):
            return x
        # Small values typically look like ratios or margins: skip them.
        if scale > 1 and -1.0 <= f <= 1.0:
            return f
        return f * scale

    out = {k: (v.copy() if isinstance(v, dict) else v) for k, v in fin.items()}
    for sec in ("balance_sheet", "income_statement", "cash_flows"):
        section = out.get(sec, {})
        if isinstance(section, dict):
            # Only existing keys are reassigned, so updating the copy while
            # iterating over it is safe and keeps the mapping's own type.
            for key, value in section.items():
                section[key] = maybe_scale(value)
        out[sec] = section
    return out