Upload 7 files
Browse files- app.py +102 -9
- finance_core.py +16 -3
- llm_extract.py +14 -14
- requirements.txt +0 -1
app.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# -*- coding: utf-8 -*-
|
| 3 |
import os
|
|
|
|
| 4 |
import json
|
| 5 |
from datetime import datetime
|
| 6 |
-
from typing import List, Optional, Dict, Any
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
import yaml
|
|
@@ -67,7 +68,7 @@ growth:
|
|
| 67 |
|
| 68 |
POLICIES = _load_policies()
|
| 69 |
|
| 70 |
-
|
| 71 |
def _read_file_input(f):
|
| 72 |
"""Return (filename, bytes) from various Gradio file input shapes."""
|
| 73 |
# Path-like / NamedString (File component with type="filepath")
|
|
@@ -106,6 +107,85 @@ def _read_file_input(f):
|
|
| 106 |
pass
|
| 107 |
raise ValueError(f"Unsupported file input type: {type(f)}")
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
def analyze(
|
| 111 |
files: List,
|
|
@@ -127,7 +207,7 @@ def analyze(
|
|
| 127 |
if not files or len(files) == 0:
|
| 128 |
raise gr.Error("決算書ファイル(PDF/画像)を1つ以上アップロードしてください。")
|
| 129 |
|
| 130 |
-
# 1) Upload files to OpenAI and extract structured financials via vision
|
| 131 |
try:
|
| 132 |
file_ids = []
|
| 133 |
for f in files:
|
|
@@ -136,7 +216,7 @@ def analyze(
|
|
| 136 |
except Exception as e:
|
| 137 |
raise gr.Error(f"ファイルのアップロードに失敗しました: {e}")
|
| 138 |
|
| 139 |
-
# Local paths for text
|
| 140 |
local_paths = []
|
| 141 |
for f in files:
|
| 142 |
if isinstance(f, (str, bytes)) or hasattr(f, "__fspath__"):
|
|
@@ -172,6 +252,18 @@ def analyze(
|
|
| 172 |
if industry_hint:
|
| 173 |
extract.industry = industry_hint
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
# 2) Compute derived ratios and risk score
|
| 176 |
ratios = compute_ratios(extract)
|
| 177 |
|
|
@@ -196,12 +288,11 @@ def analyze(
|
|
| 196 |
decisions["investment"] = investment_decision(extract, ratios, POLICIES, multiples)
|
| 197 |
|
| 198 |
# 4) Build a combined report (dict) and return displays
|
| 199 |
-
report = build_report_dict(extract, ratios, decisions)
|
| 200 |
report_json = json.dumps(report, ensure_ascii=False, indent=2)
|
| 201 |
|
| 202 |
-
# Save a downloadable JSON (ensure directory exists)
|
| 203 |
ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
|
| 204 |
-
# ✅ Gradio v5 の仕様に合わせ、既定保存先は /tmp に
|
| 205 |
data_dir = os.environ.get("HF_DATA_DIR", "/tmp")
|
| 206 |
os.makedirs(data_dir, exist_ok=True)
|
| 207 |
out_path = os.path.join(data_dir, f"report-{ts}.json")
|
|
@@ -218,6 +309,9 @@ def analyze(
|
|
| 218 |
if extract.fiscal_year_end:
|
| 219 |
summary_md.append(f"### 決算期末\n{extract.fiscal_year_end}")
|
| 220 |
|
|
|
|
|
|
|
|
|
|
| 221 |
summary_md.append("### 指標(主要)")
|
| 222 |
summary_md.append(
|
| 223 |
f"- 売上高: {ratios.get('revenue')}\n"
|
|
@@ -299,5 +393,4 @@ def build_ui():
|
|
| 299 |
|
| 300 |
if __name__ == "__main__":
|
| 301 |
demo = build_ui()
|
| 302 |
-
|
| 303 |
-
demo.launch(allowed_paths=["/tmp", "/mnt/data"])
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# -*- coding: utf-8 -*-
|
| 3 |
import os
|
| 4 |
+
import re
|
| 5 |
import json
|
| 6 |
from datetime import datetime
|
| 7 |
+
from typing import List, Optional, Dict, Any, Tuple
|
| 8 |
|
| 9 |
import gradio as gr
|
| 10 |
import yaml
|
|
|
|
| 68 |
|
| 69 |
POLICIES = _load_policies()
|
| 70 |
|
| 71 |
+
# --- Gradio ファイル入力の型差異を吸収 ---
|
| 72 |
def _read_file_input(f):
|
| 73 |
"""Return (filename, bytes) from various Gradio file input shapes."""
|
| 74 |
# Path-like / NamedString (File component with type="filepath")
|
|
|
|
| 107 |
pass
|
| 108 |
raise ValueError(f"Unsupported file input type: {type(f)}")
|
| 109 |
|
| 110 |
+
# --- 単位検出&換算ヘルパー(PDF本文を走査して「単位:百万円」等を検出) ---
|
| 111 |
+
def _concat_pdf_text(paths: List[str], max_chars: int = 180_000) -> str:
|
| 112 |
+
try:
|
| 113 |
+
from pypdf import PdfReader
|
| 114 |
+
except Exception:
|
| 115 |
+
return ""
|
| 116 |
+
out = []
|
| 117 |
+
total = 0
|
| 118 |
+
for p in paths:
|
| 119 |
+
try:
|
| 120 |
+
r = PdfReader(p)
|
| 121 |
+
for page in r.pages:
|
| 122 |
+
t = page.extract_text() or ""
|
| 123 |
+
if t:
|
| 124 |
+
out.append(t)
|
| 125 |
+
total += len(t)
|
| 126 |
+
if total > max_chars:
|
| 127 |
+
break
|
| 128 |
+
except Exception:
|
| 129 |
+
continue
|
| 130 |
+
if total > max_chars:
|
| 131 |
+
break
|
| 132 |
+
return "\n\n".join(out)[:max_chars]
|
| 133 |
+
|
| 134 |
+
def detect_unit_multiplier_from_paths(paths: List[str]) -> Tuple[float, str]:
    """Guess the reporting unit used in the given PDFs.

    The PDFs' text is obtained through ``_concat_pdf_text`` and scanned for a
    unit declaration. Checks run in this priority order and the first hit wins:

    1. Japanese declarations, written either as ``単位: X`` or as a
       parenthesised ``(X)`` (ASCII or full-width colon/parentheses), for
       百万円 (x1,000,000), 千円 (x1,000), 万円 (x10,000) and 円 (x1).
    2. English declarations such as "in millions of yen" or "JPY (millions)",
       and the "thousands" equivalents.
    3. A bare occurrence of 百万円 anywhere in the text.

    Args:
        paths: Paths of the PDF files to scan.

    Returns:
        ``(multiplier, label)``. ``(1.0, "不明")`` is returned when no text
        could be extracted or no pattern matched.
    """
    text = _concat_pdf_text(paths)
    if not text:
        return 1.0, "不明"

    # Japanese declarations. The parenthesised form must match *literal*
    # parentheses: an unescaped "(円)" is just a regex group and would match
    # any 円 in the document, shadowing every later check.
    japanese_units = (
        ("百万円", 1_000_000.0),
        ("千円", 1_000.0),
        ("万円", 10_000.0),
        ("円", 1.0),
    )
    for label, multiplier in japanese_units:
        declared = rf"単位\s*[::]\s*{label}"
        parenthesised = rf"[((]\s*{label}\s*[))]"
        if re.search(declared, text) or re.search(parenthesised, text):
            return multiplier, label

    # English declarations (matched case-insensitively via the lowered text).
    # "\b" is applied only to the alphabetic currency codes: in front of a
    # symbol such as "¥" or "$" it would require a preceding word character.
    lower = text.lower()
    currency_tag = r"(?:\b(?:jpy|usd)|[¥¥$])"
    english_units = (
        ("millions", 1_000_000.0),
        ("thousands", 1_000.0),
    )
    for label, multiplier in english_units:
        phrase = rf"in\s+{label}\s+of\s+(?:yen|jpy|usd|dollars?)"
        tagged = rf"{currency_tag}\s*\(\s*{label[:-1]}s?\s*\)"
        if re.search(phrase, lower) or re.search(tagged, lower):
            return multiplier, label

    # Last resort: 百万円 mentioned on its own somewhere in the document.
    if re.search(r"百万円", text):
        return 1_000_000.0, "百万円"

    return 1.0, "不明"
|
| 168 |
+
|
| 169 |
+
_NUM_FIELDS = [
|
| 170 |
+
"revenue","cogs","ebit","depreciation","ebitda","net_income",
|
| 171 |
+
"cash_and_equivalents","accounts_receivable","inventory","accounts_payable",
|
| 172 |
+
"current_assets","current_liabilities","total_assets","total_equity",
|
| 173 |
+
"total_debt","interest_expense",
|
| 174 |
+
]
|
| 175 |
+
|
| 176 |
+
def scale_extract_inplace(extract: FinancialExtract, multiplier: float) -> None:
    """Multiply the numeric fields of every period by ``multiplier``, in place.

    Only the attributes named in ``_NUM_FIELDS`` are touched, and attributes
    whose value is ``None`` are left alone. Nothing happens when
    ``multiplier`` is falsy or equal to 1. A value that cannot be converted or
    assigned is silently left unchanged.
    """
    if not multiplier or multiplier == 1:
        return

    for period in extract.periods:
        for field_name in _NUM_FIELDS:
            current = getattr(period, field_name)
            if current is None:
                continue
            try:
                scaled = float(current) * float(multiplier)
                setattr(period, field_name, scaled)
            except Exception:
                # Best-effort: keep the original value if scaling fails.
                pass
|
| 188 |
+
|
| 189 |
|
| 190 |
def analyze(
|
| 191 |
files: List,
|
|
|
|
| 207 |
if not files or len(files) == 0:
|
| 208 |
raise gr.Error("決算書ファイル(PDF/画像)を1つ以上アップロードしてください。")
|
| 209 |
|
| 210 |
+
# 1) Upload files to OpenAI and extract structured financials via vision
|
| 211 |
try:
|
| 212 |
file_ids = []
|
| 213 |
for f in files:
|
|
|
|
| 216 |
except Exception as e:
|
| 217 |
raise gr.Error(f"ファイルのアップロードに失敗しました: {e}")
|
| 218 |
|
| 219 |
+
# Local paths for text & unit fallback
|
| 220 |
local_paths = []
|
| 221 |
for f in files:
|
| 222 |
if isinstance(f, (str, bytes)) or hasattr(f, "__fspath__"):
|
|
|
|
| 252 |
if industry_hint:
|
| 253 |
extract.industry = industry_hint
|
| 254 |
|
| 255 |
+
# --- 単位検出&換算(円/ドル等の素単位に正規化)---
|
| 256 |
+
unit_info = {"source_label": "不明", "multiplier": 1}
|
| 257 |
+
try:
|
| 258 |
+
if local_paths:
|
| 259 |
+
mult, label = detect_unit_multiplier_from_paths(local_paths)
|
| 260 |
+
unit_info = {"source_label": label, "multiplier": int(mult)}
|
| 261 |
+
if mult and mult != 1:
|
| 262 |
+
scale_extract_inplace(extract, mult)
|
| 263 |
+
except Exception as e:
|
| 264 |
+
if debug:
|
| 265 |
+
print(f"[unit-detect] warning: {e}")
|
| 266 |
+
|
| 267 |
# 2) Compute derived ratios and risk score
|
| 268 |
ratios = compute_ratios(extract)
|
| 269 |
|
|
|
|
| 288 |
decisions["investment"] = investment_decision(extract, ratios, POLICIES, multiples)
|
| 289 |
|
| 290 |
# 4) Build a combined report (dict) and return displays
|
| 291 |
+
report = build_report_dict(extract, ratios, decisions, unit_info=unit_info)
|
| 292 |
report_json = json.dumps(report, ensure_ascii=False, indent=2)
|
| 293 |
|
| 294 |
+
# Save a downloadable JSON (ensure directory exists; Gradio v5 は /tmp を推奨)
|
| 295 |
ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
|
|
|
|
| 296 |
data_dir = os.environ.get("HF_DATA_DIR", "/tmp")
|
| 297 |
os.makedirs(data_dir, exist_ok=True)
|
| 298 |
out_path = os.path.join(data_dir, f"report-{ts}.json")
|
|
|
|
| 309 |
if extract.fiscal_year_end:
|
| 310 |
summary_md.append(f"### 決算期末\n{extract.fiscal_year_end}")
|
| 311 |
|
| 312 |
+
summary_md.append("### 単位(検出結果)")
|
| 313 |
+
summary_md.append(f"- ソース表記: {unit_info['source_label']} / 乗数: x{unit_info['multiplier']:,}" + ("(数値は換算済み)" if unit_info["multiplier"] != 1 else ""))
|
| 314 |
+
|
| 315 |
summary_md.append("### 指標(主要)")
|
| 316 |
summary_md.append(
|
| 317 |
f"- 売上高: {ratios.get('revenue')}\n"
|
|
|
|
| 393 |
|
| 394 |
if __name__ == "__main__":
    # /tmp is the default location for saved reports; /mnt/data is allowed
    # as well in case it is needed.
    served_dirs = ["/tmp", "/mnt/data"]
    demo = build_ui()
    demo.launch(allowed_paths=served_dirs)
|
|
|
finance_core.py
CHANGED
|
@@ -251,11 +251,24 @@ def investment_decision(extract: FinancialExtract, ratios: Dict[str, Any], polic
|
|
| 251 |
"recommended_check_size": check, "recommended_check_size_display": _fmt_currency(check, currency),
|
| 252 |
"attractiveness": attractiveness, "growth_label": glabel}
|
| 253 |
|
| 254 |
-
def build_report_dict(
|
| 255 |
-
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
"extracted": extract.dict(),
|
| 258 |
"ratios": ratios,
|
| 259 |
"decisions": decisions,
|
| 260 |
"disclaimer": "本ツールはAIによる推定・一般的な計算式に基づく参考提案であり、投資勧誘・融資約定・与信保証を目的としたものではありません。最終判断は自己責任で、必要に応じて専門家の確認を行ってください。",
|
| 261 |
}
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
"recommended_check_size": check, "recommended_check_size_display": _fmt_currency(check, currency),
|
| 252 |
"attractiveness": attractiveness, "growth_label": glabel}
|
| 253 |
|
| 254 |
+
def build_report_dict(
    extract: "FinancialExtract",
    ratios: Dict[str, Any],
    decisions: Dict[str, Any],
    unit_info: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the combined analysis report as a plain dictionary.

    Args:
        extract: The extracted financial statement data.
        ratios: Derived ratios, stored under ``"ratios"`` as given.
        decisions: Decision results, stored under ``"decisions"`` as given.
        unit_info: Optional unit-detection details. When provided and
            non-empty it is stored under ``"unit_detection"``; otherwise the
            key is omitted.

    Returns:
        A dictionary with the keys ``metadata``, ``extracted``, ``ratios``,
        ``decisions`` and ``disclaimer``, plus ``unit_detection`` when
        ``unit_info`` is given.
    """
    # Prefer the Pydantic v2 serializer; ``.dict()`` is deprecated there and
    # is kept only as a fallback for models that do not offer ``model_dump``.
    dump = getattr(extract, "model_dump", None) or extract.dict

    out = {
        "metadata": {
            "company_name": extract.company_name,
            "industry": extract.industry,
            "currency": extract.currency,
            "fiscal_year_end": extract.fiscal_year_end,
        },
        "extracted": dump(),
        "ratios": ratios,
        "decisions": decisions,
        "disclaimer": "本ツールはAIによる推定・一般的な計算式に基づく参考提案であり、投資勧誘・融資約定・与信保証を目的としたものではありません。最終判断は自己責任で、必要に応じて専門家の確認を行ってください。",
    }
    if unit_info:
        out["unit_detection"] = unit_info
    return out
|
llm_extract.py
CHANGED
|
@@ -43,9 +43,11 @@ def _strip_code_fences(s: str) -> str:
|
|
| 43 |
return s2.strip()
|
| 44 |
|
| 45 |
def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
for p in paths:
|
| 50 |
try:
|
| 51 |
r = PdfReader(p)
|
|
@@ -53,14 +55,14 @@ def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
|
|
| 53 |
t = page.extract_text() or ""
|
| 54 |
if t:
|
| 55 |
out.append(t)
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
except Exception:
|
| 59 |
-
# 1つ失敗しても続行
|
| 60 |
continue
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
return
|
| 64 |
|
| 65 |
def _json_loads_strict(raw: str) -> dict:
|
| 66 |
try:
|
|
@@ -75,7 +77,7 @@ def extract_financials_from_files(
|
|
| 75 |
currency_hint: Optional[str],
|
| 76 |
model: str = VISION_MODEL,
|
| 77 |
debug: bool = False,
|
| 78 |
-
local_paths: Optional[List[str]] = None, # ←
|
| 79 |
) -> FinancialExtract:
|
| 80 |
|
| 81 |
schema = FinancialExtract.model_json_schema()
|
|
@@ -93,7 +95,7 @@ def extract_financials_from_files(
|
|
| 93 |
if currency_hint:
|
| 94 |
base_user += f"\nCurrency hint: {currency_hint}"
|
| 95 |
|
| 96 |
-
# 1)
|
| 97 |
try:
|
| 98 |
resp = client.responses.create(
|
| 99 |
model=model,
|
|
@@ -106,7 +108,6 @@ def extract_financials_from_files(
|
|
| 106 |
],
|
| 107 |
},
|
| 108 |
],
|
| 109 |
-
# 一部ゲートウェイでは response_format 未対応のため入れない
|
| 110 |
max_output_tokens=2048,
|
| 111 |
)
|
| 112 |
raw = _safe_output_text(resp)
|
|
@@ -114,13 +115,12 @@ def extract_financials_from_files(
|
|
| 114 |
return FinancialExtract.model_validate(data)
|
| 115 |
|
| 116 |
except Exception as e_vision:
|
| 117 |
-
# 2)
|
| 118 |
if not local_paths:
|
| 119 |
raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
|
| 120 |
|
| 121 |
text = _pdf_text_concat(local_paths)
|
| 122 |
if not text:
|
| 123 |
-
# 画像ベースPDFの可能性 → Visionエラー内容を返す
|
| 124 |
raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
|
| 125 |
|
| 126 |
user2 = (
|
|
|
|
| 43 |
return s2.strip()
|
| 44 |
|
| 45 |
def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
|
| 46 |
+
try:
|
| 47 |
+
from pypdf import PdfReader
|
| 48 |
+
except Exception:
|
| 49 |
+
return ""
|
| 50 |
+
out, total = [], 0
|
| 51 |
for p in paths:
|
| 52 |
try:
|
| 53 |
r = PdfReader(p)
|
|
|
|
| 55 |
t = page.extract_text() or ""
|
| 56 |
if t:
|
| 57 |
out.append(t)
|
| 58 |
+
total += len(t)
|
| 59 |
+
if total > max_chars:
|
| 60 |
+
break
|
| 61 |
except Exception:
|
|
|
|
| 62 |
continue
|
| 63 |
+
if total > max_chars:
|
| 64 |
+
break
|
| 65 |
+
return "\n\n".join(out)[:max_chars]
|
| 66 |
|
| 67 |
def _json_loads_strict(raw: str) -> dict:
|
| 68 |
try:
|
|
|
|
| 77 |
currency_hint: Optional[str],
|
| 78 |
model: str = VISION_MODEL,
|
| 79 |
debug: bool = False,
|
| 80 |
+
local_paths: Optional[List[str]] = None, # ← フォールバック用
|
| 81 |
) -> FinancialExtract:
|
| 82 |
|
| 83 |
schema = FinancialExtract.model_json_schema()
|
|
|
|
| 95 |
if currency_hint:
|
| 96 |
base_user += f"\nCurrency hint: {currency_hint}"
|
| 97 |
|
| 98 |
+
# 1) Vision + file_id で試す(response_format 未使用)
|
| 99 |
try:
|
| 100 |
resp = client.responses.create(
|
| 101 |
model=model,
|
|
|
|
| 108 |
],
|
| 109 |
},
|
| 110 |
],
|
|
|
|
| 111 |
max_output_tokens=2048,
|
| 112 |
)
|
| 113 |
raw = _safe_output_text(resp)
|
|
|
|
| 115 |
return FinancialExtract.model_validate(data)
|
| 116 |
|
| 117 |
except Exception as e_vision:
|
| 118 |
+
# 2) テキスト抽出 → TEXT_MODEL で構造化
|
| 119 |
if not local_paths:
|
| 120 |
raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
|
| 121 |
|
| 122 |
text = _pdf_text_concat(local_paths)
|
| 123 |
if not text:
|
|
|
|
| 124 |
raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
|
| 125 |
|
| 126 |
user2 = (
|
requirements.txt
CHANGED
|
@@ -5,4 +5,3 @@ pyyaml>=6.0.1
|
|
| 5 |
numpy>=1.26.4
|
| 6 |
pandas>=2.2.2
|
| 7 |
pypdf>=4.2.0
|
| 8 |
-
|
|
|
|
| 5 |
numpy>=1.26.4
|
| 6 |
pandas>=2.2.2
|
| 7 |
pypdf>=4.2.0
|
|
|