Corin1998 committed on
Commit
8752b28
·
verified ·
1 Parent(s): ca7a26c

Upload 7 files

Browse files
Files changed (4) hide show
  1. app.py +102 -9
  2. finance_core.py +16 -3
  3. llm_extract.py +14 -14
  4. requirements.txt +0 -1
app.py CHANGED
@@ -1,9 +1,10 @@
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  import os
 
4
  import json
5
  from datetime import datetime
6
- from typing import List, Optional, Dict, Any
7
 
8
  import gradio as gr
9
  import yaml
@@ -67,7 +68,7 @@ growth:
67
 
68
  POLICIES = _load_policies()
69
 
70
-
71
  def _read_file_input(f):
72
  """Return (filename, bytes) from various Gradio file input shapes."""
73
  # Path-like / NamedString (File component with type="filepath")
@@ -106,6 +107,85 @@ def _read_file_input(f):
106
  pass
107
  raise ValueError(f"Unsupported file input type: {type(f)}")
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  def analyze(
111
  files: List,
@@ -127,7 +207,7 @@ def analyze(
127
  if not files or len(files) == 0:
128
  raise gr.Error("決算書ファイル(PDF/画像)を1つ以上アップロードしてください。")
129
 
130
- # 1) Upload files to OpenAI and extract structured financials via vision + Structured Outputs
131
  try:
132
  file_ids = []
133
  for f in files:
@@ -136,7 +216,7 @@ def analyze(
136
  except Exception as e:
137
  raise gr.Error(f"ファイルのアップロードに失敗しました: {e}")
138
 
139
- # Local paths for text fallback (if available)
140
  local_paths = []
141
  for f in files:
142
  if isinstance(f, (str, bytes)) or hasattr(f, "__fspath__"):
@@ -172,6 +252,18 @@ def analyze(
172
  if industry_hint:
173
  extract.industry = industry_hint
174
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  # 2) Compute derived ratios and risk score
176
  ratios = compute_ratios(extract)
177
 
@@ -196,12 +288,11 @@ def analyze(
196
  decisions["investment"] = investment_decision(extract, ratios, POLICIES, multiples)
197
 
198
  # 4) Build a combined report (dict) and return displays
199
- report = build_report_dict(extract, ratios, decisions)
200
  report_json = json.dumps(report, ensure_ascii=False, indent=2)
201
 
202
- # Save a downloadable JSON (ensure directory exists)
203
  ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
204
- # ✅ Gradio v5 の仕様に合わせ、既定保存先は /tmp に
205
  data_dir = os.environ.get("HF_DATA_DIR", "/tmp")
206
  os.makedirs(data_dir, exist_ok=True)
207
  out_path = os.path.join(data_dir, f"report-{ts}.json")
@@ -218,6 +309,9 @@ def analyze(
218
  if extract.fiscal_year_end:
219
  summary_md.append(f"### 決算期末\n{extract.fiscal_year_end}")
220
 
 
 
 
221
  summary_md.append("### 指標(主要)")
222
  summary_md.append(
223
  f"- 売上高: {ratios.get('revenue')}\n"
@@ -299,5 +393,4 @@ def build_ui():
299
 
300
  if __name__ == "__main__":
301
  demo = build_ui()
302
- # ✅ /tmp /mnt/data の両方を(必要なら)許可。/tmp 既定保存先なのでこのままでOK。
303
- demo.launch(allowed_paths=["/tmp", "/mnt/data"])
 
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  import os
4
+ import re
5
  import json
6
  from datetime import datetime
7
+ from typing import List, Optional, Dict, Any, Tuple
8
 
9
  import gradio as gr
10
  import yaml
 
68
 
69
  POLICIES = _load_policies()
70
 
71
+ # --- Gradio ファイル入力の型差異を吸収 ---
72
  def _read_file_input(f):
73
  """Return (filename, bytes) from various Gradio file input shapes."""
74
  # Path-like / NamedString (File component with type="filepath")
 
107
  pass
108
  raise ValueError(f"Unsupported file input type: {type(f)}")
109
 
110
+ # --- 単位検出&換算ヘルパー(PDF本文を走査して「単位:百万円」等を検出) ---
111
# Result returned when no unit declaration can be found.
_UNKNOWN_UNIT = (1.0, "不明")

# Explicit Japanese unit declarations, checked from the largest unit down:
# "単位:百万円" style (ASCII or full-width colon) or the unit alone inside
# literal parentheses (ASCII or full-width), e.g. "(千円)".
_JP_UNIT_PATTERNS = tuple(
    (
        re.compile(rf"単位\s*[:：]\s*{label}|[（(]\s*{label}\s*[）)]"),
        multiplier,
        label,
    )
    for label, multiplier in (
        ("百万円", 1_000_000.0),
        ("千円", 1_000.0),
        ("万円", 10_000.0),
        ("円", 1.0),
    )
)

# English declarations, matched against lower-cased text:
# "in millions of yen" or "<currency> (millions)". The currency symbols are
# kept outside the ``\b`` alternative because a word boundary never matches
# in front of a symbol that follows whitespace or starts the text.
_EN_UNIT_PATTERNS = tuple(
    (
        re.compile(
            rf"in\s+{label}\s+of\s+(?:yen|jpy|usd|dollars?)"
            rf"|(?:\b(?:jpy|usd)|[¥￥$])\s*\(\s*{label}?\s*\)"
        ),
        multiplier,
        label,
    )
    for label, multiplier in (
        ("millions", 1_000_000.0),
        ("thousands", 1_000.0),
    )
)


def _concat_pdf_text(paths: List[str], max_chars: int = 180_000) -> str:
    """Concatenate the extractable text of the given PDFs, capped at ``max_chars``.

    Args:
        paths: Paths handed to ``pypdf.PdfReader`` one by one.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        Page texts joined by blank lines and truncated to ``max_chars``.
        An empty string when there is nothing to read, when ``pypdf`` cannot
        be imported, or when no page yields any text.
    """
    if not paths or max_chars <= 0:
        return ""
    try:
        from pypdf import PdfReader
    except ImportError:
        return ""

    chunks: List[str] = []
    total = 0
    for path in paths:
        if total > max_chars:
            break
        try:
            for page in PdfReader(path).pages:
                page_text = page.extract_text() or ""
                if page_text:
                    chunks.append(page_text)
                    total += len(page_text)
                    if total > max_chars:
                        break
        except Exception:
            # Deliberate best effort: a file that cannot be parsed is skipped
            # so the remaining files are still read.
            continue
    return "\n\n".join(chunks)[:max_chars]


def detect_unit_multiplier_from_text(text: str) -> Tuple[float, str]:
    """Classify the reporting unit declared in already-extracted text.

    Checks run in this order: explicit Japanese declarations
    (百万円, 千円, 万円, 円), English declarations (millions, thousands),
    and finally a bare occurrence of "百万円" anywhere in the text.

    Args:
        text: Text to scan; may be empty.

    Returns:
        ``(multiplier, label)``, or ``(1.0, "不明")`` when nothing matches.
    """
    if not text:
        return _UNKNOWN_UNIT

    for pattern, multiplier, label in _JP_UNIT_PATTERNS:
        if pattern.search(text):
            return multiplier, label

    lowered = text.lower()
    for pattern, multiplier, label in _EN_UNIT_PATTERNS:
        if pattern.search(lowered):
            return multiplier, label

    # Last resort: amounts written with the unit inline (e.g. "1,200百万円").
    if "百万円" in text:
        return 1_000_000.0, "百万円"

    return _UNKNOWN_UNIT


def detect_unit_multiplier_from_paths(paths: List[str]) -> Tuple[float, str]:
    """Estimate the reporting unit from the text of the given PDFs.

    Args:
        paths: PDF paths whose text is concatenated and scanned.

    Returns:
        ``(multiplier, label)``, for example ``(1_000_000.0, "百万円")``,
        ``(1_000.0, "千円")``, ``(10_000.0, "万円")``, ``(1.0, "円")``,
        ``(1_000_000.0, "millions")`` or ``(1_000.0, "thousands")``.
        ``(1.0, "不明")`` when no text could be extracted or no unit
        declaration is found.
    """
    return detect_unit_multiplier_from_text(_concat_pdf_text(paths))
168
+
169
# Per-period numeric attributes that are expressed in the statement's
# reporting unit and therefore need rescaling.
_NUM_FIELDS = [
    "revenue", "cogs", "ebit", "depreciation", "ebitda", "net_income",
    "cash_and_equivalents", "accounts_receivable", "inventory", "accounts_payable",
    "current_assets", "current_liabilities", "total_assets", "total_equity",
    "total_debt", "interest_expense",
]


def scale_extract_inplace(extract: "FinancialExtract", multiplier: float) -> None:
    """Multiply every numeric field of every period by ``multiplier``, in place.

    A falsy multiplier or a multiplier of 1 is a no-op. Fields that are
    missing, ``None``, or not convertible to ``float`` are left untouched,
    so one bad value never leaves the remaining fields or periods unscaled.

    Args:
        extract: Object exposing a ``periods`` iterable whose items carry the
            attributes named in ``_NUM_FIELDS``.
        multiplier: Factor applied to each numeric value.
    """
    if not multiplier or multiplier == 1:
        return
    try:
        factor = float(multiplier)
    except (TypeError, ValueError):
        # A non-numeric multiplier cannot scale anything; keep data as is.
        return

    for period in getattr(extract, "periods", None) or []:
        for field_name in _NUM_FIELDS:
            # Default to None so a period lacking a field is skipped rather
            # than aborting the loop half-way through the extract.
            value = getattr(period, field_name, None)
            if value is None:
                continue
            try:
                setattr(period, field_name, float(value) * factor)
            except (TypeError, ValueError):
                # Non-numeric value, or the model rejected the assignment.
                continue
188
+
189
 
190
  def analyze(
191
  files: List,
 
207
  if not files or len(files) == 0:
208
  raise gr.Error("決算書ファイル(PDF/画像)を1つ以上アップロードしてください。")
209
 
210
+ # 1) Upload files to OpenAI and extract structured financials via vision
211
  try:
212
  file_ids = []
213
  for f in files:
 
216
  except Exception as e:
217
  raise gr.Error(f"ファイルのアップロードに失敗しました: {e}")
218
 
219
+ # Local paths for text & unit fallback
220
  local_paths = []
221
  for f in files:
222
  if isinstance(f, (str, bytes)) or hasattr(f, "__fspath__"):
 
252
  if industry_hint:
253
  extract.industry = industry_hint
254
 
255
+ # --- 単位検出&換算(円/ドル等の素単位に正規化)---
256
+ unit_info = {"source_label": "不明", "multiplier": 1}
257
+ try:
258
+ if local_paths:
259
+ mult, label = detect_unit_multiplier_from_paths(local_paths)
260
+ unit_info = {"source_label": label, "multiplier": int(mult)}
261
+ if mult and mult != 1:
262
+ scale_extract_inplace(extract, mult)
263
+ except Exception as e:
264
+ if debug:
265
+ print(f"[unit-detect] warning: {e}")
266
+
267
  # 2) Compute derived ratios and risk score
268
  ratios = compute_ratios(extract)
269
 
 
288
  decisions["investment"] = investment_decision(extract, ratios, POLICIES, multiples)
289
 
290
  # 4) Build a combined report (dict) and return displays
291
+ report = build_report_dict(extract, ratios, decisions, unit_info=unit_info)
292
  report_json = json.dumps(report, ensure_ascii=False, indent=2)
293
 
294
+ # Save a downloadable JSON (ensure directory exists; Gradio v5 は /tmp を推奨)
295
  ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
 
296
  data_dir = os.environ.get("HF_DATA_DIR", "/tmp")
297
  os.makedirs(data_dir, exist_ok=True)
298
  out_path = os.path.join(data_dir, f"report-{ts}.json")
 
309
  if extract.fiscal_year_end:
310
  summary_md.append(f"### 決算期末\n{extract.fiscal_year_end}")
311
 
312
+ summary_md.append("### 単位(検出結果)")
313
+ summary_md.append(f"- ソース表記: {unit_info['source_label']} / 乗数: x{unit_info['multiplier']:,}" + ("(数値は換算済み)" if unit_info["multiplier"] != 1 else ""))
314
+
315
  summary_md.append("### 指標(主要)")
316
  summary_md.append(
317
  f"- 売上高: {ratios.get('revenue')}\n"
 
393
 
394
  if __name__ == "__main__":
395
  demo = build_ui()
396
+ demo.launch(allowed_paths=["/tmp", "/mnt/data"]) # /tmp を既定保存先にしつつ、必要なら /mnt/data も許可
 
finance_core.py CHANGED
@@ -251,11 +251,24 @@ def investment_decision(extract: FinancialExtract, ratios: Dict[str, Any], polic
251
  "recommended_check_size": check, "recommended_check_size_display": _fmt_currency(check, currency),
252
  "attractiveness": attractiveness, "growth_label": glabel}
253
 
254
- def build_report_dict(extract: FinancialExtract, ratios: Dict[str, Any], decisions: Dict[str, Any]) -> Dict[str, Any]:
255
- return {
256
- "metadata": {"company_name": extract.company_name, "industry": extract.industry, "currency": extract.currency, "fiscal_year_end": extract.fiscal_year_end},
 
 
 
 
 
 
 
 
 
 
257
  "extracted": extract.dict(),
258
  "ratios": ratios,
259
  "decisions": decisions,
260
  "disclaimer": "本ツールはAIによる推定・一般的な計算式に基づく参考提案であり、投資勧誘・融資約定・与信保証を目的としたものではありません。最終判断は自己責任で、必要に応じて専門家の確認を行ってください。",
261
  }
 
 
 
 
251
  "recommended_check_size": check, "recommended_check_size_display": _fmt_currency(check, currency),
252
  "attractiveness": attractiveness, "growth_label": glabel}
253
 
254
def build_report_dict(
    extract: "FinancialExtract",
    ratios: Dict[str, Any],
    decisions: Dict[str, Any],
    unit_info: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the combined report as a plain dictionary.

    Args:
        extract: Extracted financials; must expose ``company_name``,
            ``industry``, ``currency``, ``fiscal_year_end`` and either
            ``model_dump()`` or ``dict()``.
        ratios: Derived ratios, stored under ``"ratios"`` unchanged.
        decisions: Decision results, stored under ``"decisions"`` unchanged.
        unit_info: Optional unit-detection result. When truthy it is stored
            under ``"unit_detection"``; otherwise that key is omitted.

    Returns:
        Dictionary with ``metadata``, ``extracted``, ``ratios``,
        ``decisions``, ``disclaimer`` and, optionally, ``unit_detection``.
    """
    # Prefer ``model_dump()`` and fall back to ``dict()`` for objects that
    # only provide the older method.
    dump = getattr(extract, "model_dump", None) or extract.dict

    report: Dict[str, Any] = {
        "metadata": {
            "company_name": extract.company_name,
            "industry": extract.industry,
            "currency": extract.currency,
            "fiscal_year_end": extract.fiscal_year_end,
        },
        "extracted": dump(),
        "ratios": ratios,
        "decisions": decisions,
        "disclaimer": "本ツールはAIによる推定・一般的な計算式に基づく参考提案であり、投資勧誘・融資約定・与信保証を目的としたものではありません。最終判断は自己責任で、必要に応じて専門家の確認を行ってください。",
    }
    if unit_info:
        report["unit_detection"] = unit_info
    return report
llm_extract.py CHANGED
@@ -43,9 +43,11 @@ def _strip_code_fences(s: str) -> str:
43
  return s2.strip()
44
 
45
  def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
46
- """pypdf でテキスト抽出(画像PDFは空になることあり)。"""
47
- from pypdf import PdfReader
48
- out = []
 
 
49
  for p in paths:
50
  try:
51
  r = PdfReader(p)
@@ -53,14 +55,14 @@ def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
53
  t = page.extract_text() or ""
54
  if t:
55
  out.append(t)
56
- if sum(len(x) for x in out) > max_chars:
57
- break
 
58
  except Exception:
59
- # 1つ失敗しても続行
60
  continue
61
- text = "\n\n".join(out)
62
- # 長すぎる場合は切り詰め
63
- return text[:max_chars]
64
 
65
  def _json_loads_strict(raw: str) -> dict:
66
  try:
@@ -75,7 +77,7 @@ def extract_financials_from_files(
75
  currency_hint: Optional[str],
76
  model: str = VISION_MODEL,
77
  debug: bool = False,
78
- local_paths: Optional[List[str]] = None, # ← 追加
79
  ) -> FinancialExtract:
80
 
81
  schema = FinancialExtract.model_json_schema()
@@ -93,7 +95,7 @@ def extract_financials_from_files(
93
  if currency_hint:
94
  base_user += f"\nCurrency hint: {currency_hint}"
95
 
96
- # 1) まずは Vision + file_id で試す
97
  try:
98
  resp = client.responses.create(
99
  model=model,
@@ -106,7 +108,6 @@ def extract_financials_from_files(
106
  ],
107
  },
108
  ],
109
- # 一部ゲートウェイでは response_format 未対応のため入れない
110
  max_output_tokens=2048,
111
  )
112
  raw = _safe_output_text(resp)
@@ -114,13 +115,12 @@ def extract_financials_from_files(
114
  return FinancialExtract.model_validate(data)
115
 
116
  except Exception as e_vision:
117
- # 2) フォールバック:テキスト抽出 → TEXT_MODEL で構造化
118
  if not local_paths:
119
  raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
120
 
121
  text = _pdf_text_concat(local_paths)
122
  if not text:
123
- # 画像ベースPDFの可能性 → Visionエラー内容を返す
124
  raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
125
 
126
  user2 = (
 
43
  return s2.strip()
44
 
45
  def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
46
+ try:
47
+ from pypdf import PdfReader
48
+ except Exception:
49
+ return ""
50
+ out, total = [], 0
51
  for p in paths:
52
  try:
53
  r = PdfReader(p)
 
55
  t = page.extract_text() or ""
56
  if t:
57
  out.append(t)
58
+ total += len(t)
59
+ if total > max_chars:
60
+ break
61
  except Exception:
 
62
  continue
63
+ if total > max_chars:
64
+ break
65
+ return "\n\n".join(out)[:max_chars]
66
 
67
  def _json_loads_strict(raw: str) -> dict:
68
  try:
 
77
  currency_hint: Optional[str],
78
  model: str = VISION_MODEL,
79
  debug: bool = False,
80
+ local_paths: Optional[List[str]] = None, # ← フォールバック用
81
  ) -> FinancialExtract:
82
 
83
  schema = FinancialExtract.model_json_schema()
 
95
  if currency_hint:
96
  base_user += f"\nCurrency hint: {currency_hint}"
97
 
98
+ # 1) Vision + file_id で試す(response_format 未使用)
99
  try:
100
  resp = client.responses.create(
101
  model=model,
 
108
  ],
109
  },
110
  ],
 
111
  max_output_tokens=2048,
112
  )
113
  raw = _safe_output_text(resp)
 
115
  return FinancialExtract.model_validate(data)
116
 
117
  except Exception as e_vision:
118
+ # 2) テキスト抽出 → TEXT_MODEL で構造化
119
  if not local_paths:
120
  raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
121
 
122
  text = _pdf_text_concat(local_paths)
123
  if not text:
 
124
  raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
125
 
126
  user2 = (
requirements.txt CHANGED
@@ -5,4 +5,3 @@ pyyaml>=6.0.1
5
  numpy>=1.26.4
6
  pandas>=2.2.2
7
  pypdf>=4.2.0
8
-
 
5
  numpy>=1.26.4
6
  pandas>=2.2.2
7
  pypdf>=4.2.0