Corin1998 committed
Commit c37aeda · verified · 1 Parent(s): 876e0a2

Update core/extract.py
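
This update inlines the unit-detection helpers that previously lived in core.unit_utils, normalizes whatever `files` value Gradio hands over, verifies the httpx pin (httpx==0.27.2) when the OpenAI client is built, and moves unit scaling into _scale_fin(). The entry point still returns the same four-tuple; a minimal calling sketch (this Gradio wiring is illustrative, not part of the commit):

    # Hypothetical app.py handler; parse_pdf() normalizes `files` itself.
    from core.extract import parse_pdf

    def on_submit(files, company):
        fin, df, meta, log = parse_pdf(files, company=company, use_vision=True)
        return df, f"単位: {meta['unit_label']} (×{meta['unit_scale']:,})", log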

Files changed (1)
  core/extract.py  +166 -62
core/extract.py CHANGED
@@ -1,61 +1,78 @@
 # core/extract.py
 from __future__ import annotations
-import os, io, base64, json, shutil
+import os, io, re, json, base64, shutil
 from typing import List, Dict, Any, Tuple
 
-import pdfplumber
+import pandas as pd
 from pdf2image import convert_from_path
+import pdfplumber
+
+# Assumes the v1 OpenAI SDK (pin httpx==0.27.2 strictly on the requirements side)
 from openai import OpenAI
 
-from core.unit_utils import detect_unit_scale, apply_unit_scale
 
 OPENAI_MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
 OPENAI_MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")
 
-SYSTEM_JSON = """あなたは有能な財務アナリストです。
-与えられた決算書(画像またはテキスト)から、次の厳密な JSON 構造のみを日本語の単位なし・半角数値で返してください。分からない項目は null。
-{
-  "company": {"name": null},
-  "period": {"start_date": null, "end_date": null},
-  "balance_sheet": {
-    "total_assets": null, "total_liabilities": null, "total_equity": null,
-    "current_assets": null, "fixed_assets": null,
-    "current_liabilities": null, "long_term_liabilities": null
-  },
-  "income_statement": {
-    "sales": null, "cost_of_sales": null, "gross_profit": null,
-    "operating_expenses": null, "operating_income": null,
-    "ordinary_income": null, "net_income": null
-  },
-  "cash_flows": {
-    "operating_cash_flow": null, "investing_cash_flow": null, "financing_cash_flow": null
-  }
-}
-"""
 
+# ---------- Internal utilities ----------
 def _b64(img: bytes) -> str:
-    import base64
     return base64.b64encode(img).decode("utf-8")
 
 def _client() -> OpenAI:
+    # httpx compatibility check (the 0.28 series fails on the proxies argument)
+    try:
+        import httpx
+        if not httpx.__version__.startswith("0.27."):
+            raise RuntimeError(
+                f"httpx==0.27.x を利用してください(現在: {httpx.__version__})。"
+                " requirements.txt に `httpx==0.27.2` を明記。"
+            )
+    except Exception:
+        # re-raise so the message surfaces in the UI (easier to diagnose)
+        raise
+
     key = os.environ.get("OPENAI_API_KEY")
     if not key:
-        raise RuntimeError("OPENAI_API_KEY が未設定です。Spaces Settings → Variables and secrets に設定してください。")
-    # do not pass proxies (compatible with the httpx==0.27 series)
-    return OpenAI(api_key=key, timeout=30)
+        raise RuntimeError("OPENAI_API_KEY が未設定です。Spaces Secrets に追加してください。")
+    return OpenAI(api_key=key, timeout=60)
+
+def _coerce_filepaths(files) -> List[str]:
+    """Normalize the `files` value passed in by Gradio into a list of paths."""
+    paths: List[str] = []
+    if not files:
+        return []
+    if isinstance(files, str):
+        return [files] if files.lower().endswith(".pdf") and os.path.exists(files) else []
+    for f in files:
+        if isinstance(f, str):
+            p = f
+        elif isinstance(f, dict) and "name" in f:
+            p = f["name"]
+        elif hasattr(f, "name"):
+            p = getattr(f, "name")
+        elif isinstance(f, tuple) and f and isinstance(f[0], str):
+            p = f[0]
+        else:
+            p = None
+        if p and p.lower().endswith(".pdf") and os.path.exists(p):
+            paths.append(p)
+    return paths
 
+
+# ---------- PDF -> images / text ----------
 def pdf_to_images(pdf_path: str, dpi: int = 220, max_pages: int = 6) -> List[bytes]:
-    pages = convert_from_path(pdf_path, dpi=dpi, fmt="png")
+    images = convert_from_path(pdf_path, dpi=dpi, fmt="png")
     out: List[bytes] = []
-    for i, p in enumerate(pages):
+    for i, im in enumerate(images):
         if i >= max_pages:
             break
         buf = io.BytesIO()
-        p.save(buf, format="PNG")
+        im.save(buf, format="PNG")
         out.append(buf.getvalue())
     return out
 
-def pdf_to_text(pdf_path: str, max_chars: int = 16000) -> str:
+def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
     chunks: List[str] = []
     with pdfplumber.open(pdf_path) as pdf:
         for i, page in enumerate(pdf.pages):
@@ -66,13 +83,87 @@ def pdf_to_text(pdf_path: str, max_chars: int = 16000) -> str:
             break
     return "\n\n".join(chunks)[:max_chars]
 
-def _extract_with_vision(images: List[bytes], company_hint: str) -> Dict[str, Any]:
+
+# ---------- Unit detection ----------
+_UNIT_MAP = {
+    "円": 1,
+    "千円": 1_000,
+    "万円": 10_000,
+    "百万円": 1_000_000,
+    "million yen": 1_000_000,
+    "thousand yen": 1_000,
+    "yen": 1,
+}
+_UNIT_PATTERNS = [
+    r"単位\s*[::]?\s*(百万円|千円|万円|円)",
+    r"単位\s*[((]\s*(百万円|千円|万円|円)\s*[))]",
+    r"(?:units?)\s*[::]?\s*(million yen|thousand yen|yen)",
+]
+
+def detect_unit(text: str) -> Tuple[str, int, List[str]]:
+    """
+    Infer the reporting unit from the PDF text; the most frequent hit wins,
+    defaulting to 百万円 when nothing matches. Returns: (label, scale, hits[]).
+    """
+    hits: List[str] = []
+    for pat in _UNIT_PATTERNS:
+        for m in re.finditer(pat, text, flags=re.I):
+            g = m.group(1).lower()
+            # Japanese labels map directly; English ones stay lowercased for the map
+            if g in ["百万円", "千円", "万円", "円"]:
+                hits.append(g)
+            elif g in ["million yen", "thousand yen", "yen"]:
+                hits.append(g)
+
+    if hits:
+        # most frequent hit
+        from collections import Counter
+        label = Counter(hits).most_common(1)[0][0]
+        # prefer the Japanese label for display
+        disp = {"million yen": "百万円", "thousand yen": "千円", "yen": "円"}.get(label, label)
+        scale = _UNIT_MAP[label]
+        return disp, scale, hits
+
+    # secondary hints such as 「千円未満切捨て」 (amounts truncated to thousands)
+    if re.search(r"千円.*切[捨下]", text):
+        return "千円", 1_000, ["補助ヒント: 千円未満切捨て"]
+    if re.search(r"百万円.*切[捨下]", text):
+        return "百万円", 1_000_000, ["補助ヒント: 百万円切捨て"]
+
+    # default to 百万円 when nothing is found
+    return "百万円", 1_000_000, []
+
+
+# ---------- Table extraction via OpenAI ----------
+SYSTEM_JSON = """あなたは有能な財務アナリストです。
+与えられた決算書(画像またはテキスト)から、次の厳密な JSON 構造のみを日本語の単位なし・半角数値で返してください。分からない項目は null。
+{
+  "company": {"name": null},
+  "period": {"start_date": null, "end_date": null},
+  "balance_sheet": {
+    "total_assets": null, "total_liabilities": null, "total_equity": null,
+    "current_assets": null, "fixed_assets": null,
+    "current_liabilities": null, "long_term_liabilities": null
+  },
+  "income_statement": {
+    "sales": null, "cost_of_sales": null, "gross_profit": null,
+    "operating_expenses": null, "operating_income": null,
+    "ordinary_income": null, "net_income": null
+  },
+  "cash_flows": {
+    "operating_cash_flow": null, "investing_cash_flow": null, "financing_cash_flow": null
+  }
+}
+"""
+
+def _extract_with_vision(images: List[bytes], company_hint: str = "") -> Dict[str, Any]:
     client = _client()
     content = [{"type": "text", "text": SYSTEM_JSON}]
     if company_hint:
         content.append({"type": "text", "text": f"会社名の候補: {company_hint}"})
     for im in images:
         content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{_b64(im)}"}})
+
     resp = client.chat.completions.create(
         model=OPENAI_MODEL_VISION,
         messages=[
@@ -84,9 +175,9 @@ def _extract_with_vision(images: List[bytes], company_hint: str) -> Dict[str, Any]:
     )
     return json.loads(resp.choices[0].message.content)
 
-def _extract_with_text(text_blob: str, company_hint: str) -> Dict[str, Any]:
+def _extract_with_text(text: str, company_hint: str = "") -> Dict[str, Any]:
     client = _client()
-    prompt = f"{SYSTEM_JSON}\n\n以下は決算書のテキストです。上記の JSON だけを返してください。\n\n{text_blob or ''}"
+    prompt = f"{SYSTEM_JSON}\n\n以下は決算書のテキストです。上記の JSON だけを返してください。\n\n{text or ''}"
    resp = client.chat.completions.create(
         model=OPENAI_MODEL_TEXT,
         messages=[
@@ -98,8 +189,9 @@ def _extract_with_text(text_blob: str, company_hint: str) -> Dict[str, Any]:
     )
     return json.loads(resp.choices[0].message.content)
 
-def fin_to_df(fin: Dict[str, Any]):
-    import pandas as pd
+
+# ---------- JSON <-> DataFrame conversion and scaling ----------
+def fin_to_df(fin: Dict[str, Any]) -> pd.DataFrame:
     rows = []
     def add(cat, d):
         for k, v in (d or {}).items():
@@ -109,29 +201,48 @@
     add("cash_flows", fin.get("cash_flows"))
     return pd.DataFrame(rows, columns=["category", "item", "value"])
 
-def parse_pdf(files: List[str], company: str = "", use_vision: bool = True) -> Tuple[Dict[str,Any], "pd.DataFrame", Dict[str,Any], str]:
+def _scale_fin(fin: Dict[str, Any], scale: float) -> Dict[str, Any]:
+    def sc_val(v):
+        if v in (None, "", "null"):
+            return None
+        try:
+            return float(v) * scale
+        except Exception:
+            return None
+
+    out = json.loads(json.dumps(fin))  # deep copy via JSON round-trip
+    for sec in ("balance_sheet", "income_statement", "cash_flows"):
+        if sec in out and isinstance(out[sec], dict):
+            for k, v in out[sec].items():
+                out[sec][k] = sc_val(v)
+    return out
+
+
+# ---------- Entry point: PDF parsing ----------
+def parse_pdf(files, company: str = "", use_vision: bool = True) -> Tuple[Dict[str, Any], pd.DataFrame, Dict[str, Any], str]:
     """
     Returns: (fin_scaled, df_scaled, meta, log)
     meta: {"unit_label","unit_scale","unit_hits":[...],"warnings":[...]}
     """
     logs = []
-    if not files:
+    paths = _coerce_filepaths(files)
+    if not paths:
         raise RuntimeError("PDF をアップロードしてください。")
 
     # 1) Concatenate the text (used as evidence for unit detection)
     all_text = ""
-    for p in files:
+    for p in paths:
         t = pdf_to_text(p)
         all_text += ("\n\n" + t) if all_text else t
-    scale, label, hits = detect_unit_scale(all_text)
-    logs.append(f"[unit] 推定: {label} (×{scale:,.0f}) | hits={hits[:5]}{'...' if len(hits)>5 else ''}")
+    unit_label, unit_scale, unit_hits = detect_unit(all_text)
+    logs.append(f"[unit] 推定: {unit_label} (×{unit_scale:,}) / hits: {unit_hits[:5]}{'...' if len(unit_hits)>5 else ''}")
 
-    # 2) Try Vision first, falling back to text on failure
+    # 2) Render to images and try Vision; fall back to text if it fails
     fin_raw: Dict[str, Any]
     if use_vision:
         try:
             all_images: List[bytes] = []
-            for p in files:
+            for p in paths:
                 all_images += pdf_to_images(p, dpi=220, max_pages=6)
             fin_raw = _extract_with_vision(all_images, company)
             logs.append("[extract] Vision 解析に成功")
@@ -141,25 +252,18 @@ def parse_pdf(files: List[str], company: str = "", use_vision: bool = True) -> Tuple[Dict[str,Any], "pd.DataFrame", Dict[str,Any], str]:
     else:
         fin_raw = _extract_with_text(all_text, company)
 
-    # 3) Apply the unit conversion
-    fin_scaled = apply_unit_scale(fin_raw, scale)
-
-    # consistency check (total assets ≈ liabilities + equity)
-    warn = []
-    try:
-        ta = float(fin_scaled.get("balance_sheet",{}).get("total_assets") or 0)
-        tl = float(fin_scaled.get("balance_sheet",{}).get("total_liabilities") or 0)
-        te = float(fin_scaled.get("balance_sheet",{}).get("total_equity") or 0)
-        if ta and (abs(ta - (tl + te)) / max(ta,1)) > 0.05:
-            warn.append("B/S 整合性に差分 >5%(単位や抽出精度を確認)")
-    except Exception:
-        pass
+    # 3) Unit scaling
+    fin_scaled = _scale_fin(fin_raw, unit_scale)
+    df_scaled = fin_to_df(fin_scaled)
 
+    # 4) Meta information
     meta = {
-        "unit_label": label,
-        "unit_scale": scale,
-        "unit_hits": hits,
-        "warnings": warn,
+        "unit_label": unit_label,
+        "unit_scale": unit_scale,
+        "unit_hits": unit_hits,
+        "warnings": [],
     }
-    df = fin_to_df(fin_scaled)
-    return fin_scaled, df, meta, "\n".join(logs + [f"[warn] {w}" for w in warn])
+
+    # 5) Log
+    log = "\n".join(logs)
+    return fin_scaled, df_scaled, meta, log
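
A quick sanity check of the new unit handling, runnable once the module's dependencies are installed (the sample strings and figures are made up for illustration):

    # Exercise detect_unit() and _scale_fin() with toy inputs.
    from core.extract import detect_unit, _scale_fin

    label, scale, hits = detect_unit("(単位:千円)")
    assert (label, scale) == ("千円", 1_000)

    fin = {"balance_sheet": {"total_assets": "1234", "total_equity": None}}
    scaled = _scale_fin(fin, scale)
    assert scaled["balance_sheet"]["total_assets"] == 1_234_000.0
    assert scaled["balance_sheet"]["total_equity"] is None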