Spaces:

Corin1998
/

judging

Sleeping

App Files Files Community

Corin1998 committed on Aug 29, 2025

Commit

8752b28

verified ·

1 Parent(s): ca7a26c

Upload 7 files

Browse files

Files changed (4) hide show

app.py +102 -9
finance_core.py +16 -3
llm_extract.py +14 -14
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
 import json
 from datetime import datetime
-from typing import List, Optional, Dict, Any
 import gradio as gr
 import yaml
@@ -67,7 +68,7 @@ growth:
 POLICIES = _load_policies()
 def _read_file_input(f):
     """Return (filename, bytes) from various Gradio file input shapes."""
     # Path-like / NamedString (File component with type="filepath")
@@ -106,6 +107,85 @@ def _read_file_input(f):
         pass
     raise ValueError(f"Unsupported file input type: {type(f)}")
 def analyze(
     files: List,
@@ -127,7 +207,7 @@ def analyze(
     if not files or len(files) == 0:
         raise gr.Error("決算書ファイル（PDF/画像）を1つ以上アップロードしてください。")
-    # 1) Upload files to OpenAI and extract structured financials via vision + Structured Outputs
     try:
         file_ids = []
         for f in files:
@@ -136,7 +216,7 @@ def analyze(
     except Exception as e:
         raise gr.Error(f"ファイルのアップロードに失敗しました: {e}")
-    # Local paths for text fallback (if available)
     local_paths = []
     for f in files:
         if isinstance(f, (str, bytes)) or hasattr(f, "__fspath__"):
@@ -172,6 +252,18 @@ def analyze(
     if industry_hint:
         extract.industry = industry_hint
     # 2) Compute derived ratios and risk score
     ratios = compute_ratios(extract)
@@ -196,12 +288,11 @@ def analyze(
         decisions["investment"] = investment_decision(extract, ratios, POLICIES, multiples)
     # 4) Build a combined report (dict) and return displays
-    report = build_report_dict(extract, ratios, decisions)
     report_json = json.dumps(report, ensure_ascii=False, indent=2)
-    # Save a downloadable JSON (ensure directory exists)
     ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
-    # ✅ Gradio v5 の仕様に合わせ、既定保存先は /tmp に
     data_dir = os.environ.get("HF_DATA_DIR", "/tmp")
     os.makedirs(data_dir, exist_ok=True)
     out_path = os.path.join(data_dir, f"report-{ts}.json")
@@ -218,6 +309,9 @@ def analyze(
     if extract.fiscal_year_end:
         summary_md.append(f"### 決算期末\n{extract.fiscal_year_end}")
     summary_md.append("### 指標（主要）")
     summary_md.append(
         f"- 売上高: {ratios.get('revenue')}\n"
@@ -299,5 +393,4 @@ def build_ui():
 if __name__ == "__main__":
     demo = build_ui()
-    # ✅ /tmp と /mnt/data の両方を（必要なら）許可。/tmp が既定保存先なのでこのままでOK。
-    demo.launch(allowed_paths=["/tmp", "/mnt/data"])

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
+import re
 import json
 from datetime import datetime
+from typing import List, Optional, Dict, Any, Tuple
 import gradio as gr
 import yaml
 POLICIES = _load_policies()
+# --- Gradio ファイル入力の型差異を吸収 ---
 def _read_file_input(f):
     """Return (filename, bytes) from various Gradio file input shapes."""
     # Path-like / NamedString (File component with type="filepath")
         pass
     raise ValueError(f"Unsupported file input type: {type(f)}")
+# --- 単位検出＆換算ヘルパー（PDF本文を走査して「単位：百万円」等を検出） ---
def _concat_pdf_text(paths: List[str], max_chars: int = 180_000) -> str:
    """Concatenate text extracted from PDF files, capped at ``max_chars``.

    This is a best-effort helper used for unit detection:

    * returns ``""`` when ``pypdf`` cannot be imported,
    * skips any file that cannot be opened or parsed (text collected from
      that file before the failure is kept),
    * stops reading as soon as the character budget is reached.

    Args:
        paths: Paths of the PDF files to read, in order.
        max_chars: Maximum number of characters to return. A non-positive
            value yields an empty string.

    Returns:
        The page texts joined by blank lines and truncated to ``max_chars``
        characters; ``""`` when nothing could be extracted.
    """
    # A negative budget would otherwise turn the final slice into
    # "drop the last N characters", which is never what the caller wants.
    if not paths or max_chars <= 0:
        return ""
    try:
        from pypdf import PdfReader
    except ImportError:
        return ""

    chunks: List[str] = []
    total = 0
    for path in paths:
        if total >= max_chars:
            break
        try:
            for page in PdfReader(path).pages:
                page_text = page.extract_text() or ""
                if not page_text:
                    continue
                chunks.append(page_text)
                total += len(page_text)
                if total >= max_chars:
                    break
        except Exception:
            # Deliberate best effort: one unreadable file must not abort
            # extraction from the remaining files.
            continue
    return "\n\n".join(chunks)[:max_chars]
# Japanese unit labels in priority order (million yen -> thousand yen ->
# ten-thousand yen -> yen) with the multiplier that converts to plain yen.
_JP_UNIT_MULTIPLIERS = (
    ("百万円", 1_000_000.0),
    ("千円", 1_000.0),
    ("万円", 10_000.0),
    ("円", 1.0),
)


def _jp_unit_regex(label: str) -> "re.Pattern":
    """Build the pattern for an explicit unit declaration of ``label``.

    Matches either a "単位：<label>" declaration or the label alone inside
    parentheses. Both full-width and half-width colons/parentheses are
    accepted because PDF text extraction frequently normalizes them.
    """
    unit = re.escape(label)
    return re.compile(
        r"単位\s*[：:]\s*" + unit + r"|[（(]\s*" + unit + r"\s*[）)]"
    )


_JP_UNIT_PATTERNS = tuple(
    (_jp_unit_regex(label), multiplier, label)
    for label, multiplier in _JP_UNIT_MULTIPLIERS
)

# English declarations, matched against lower-cased text.
_EN_CURRENCY_WORD = r"(?:yen|jpy|usd|dollars?)"
# A word boundary only makes sense before the alphabetic codes; currency
# symbols are non-word characters, so `\b` in front of them would require a
# preceding word character and miss e.g. "$ (millions)".
_EN_CURRENCY_TAG = r"(?:\b(?:jpy|usd)|[¥￥$])"
_EN_UNIT_PATTERNS = (
    (
        re.compile(
            r"in\s+millions\s+of\s+" + _EN_CURRENCY_WORD
            + r"|" + _EN_CURRENCY_TAG + r"\s*\(\s*millions?\s*\)"
        ),
        1_000_000.0,
        "millions",
    ),
    (
        re.compile(
            r"in\s+thousands\s+of\s+" + _EN_CURRENCY_WORD
            + r"|" + _EN_CURRENCY_TAG + r"\s*\(\s*thousands?\s*\)"
        ),
        1_000.0,
        "thousands",
    ),
)


def detect_unit_multiplier_from_text(text: str) -> Tuple[float, str]:
    """Infer the reporting unit from document text.

    Checks, in order: explicit Japanese declarations (million yen, thousand
    yen, ten-thousand yen, yen), explicit English declarations (millions,
    thousands), and finally a bare occurrence of "百万円" anywhere in the text.

    Args:
        text: Text of the financial document.

    Returns:
        ``(multiplier, label)`` where ``multiplier`` converts reported
        figures to the base currency unit and ``label`` is the detected unit
        as written in the source. ``(1.0, "不明")`` when nothing is detected.
    """
    if not text:
        return 1.0, "不明"

    for pattern, multiplier, label in _JP_UNIT_PATTERNS:
        if pattern.search(text):
            return multiplier, label

    lowered = text.lower()
    for pattern, multiplier, label in _EN_UNIT_PATTERNS:
        if pattern.search(lowered):
            return multiplier, label

    # Last resort: the label appears somewhere without an explicit
    # "unit:" declaration or parentheses.
    if "百万円" in text:
        return 1_000_000.0, "百万円"
    return 1.0, "不明"


def detect_unit_multiplier_from_paths(paths: List[str]) -> Tuple[float, str]:
    """Infer the reporting unit from the text of the given PDF files.

    Examples of detected units: '百万円' -> 1_000_000, '千円' -> 1_000,
    '万円' -> 10_000, '円' -> 1, 'millions' -> 1_000_000,
    'thousands' -> 1_000.

    Args:
        paths: Local paths of the PDF files to scan.

    Returns:
        ``(multiplier, label)``; ``(1.0, "不明")`` when no text could be
        extracted or no unit declaration was found.
    """
    return detect_unit_multiplier_from_text(_concat_pdf_text(paths))
# Monetary fields of a period that are expressed in the document's reporting
# unit and therefore need to be rescaled together.
_NUM_FIELDS = [
    "revenue","cogs","ebit","depreciation","ebitda","net_income",
    "cash_and_equivalents","accounts_receivable","inventory","accounts_payable",
    "current_assets","current_liabilities","total_assets","total_equity",
    "total_debt","interest_expense",
]


def scale_extract_inplace(extract: "FinancialExtract", multiplier: float) -> None:
    """Multiply every numeric field of every period by ``multiplier``, in place.

    Fields that are ``None`` or absent on a period are left untouched, and a
    value that cannot be converted to ``float`` (or that the model refuses to
    accept on assignment) is skipped so that one bad field does not prevent
    the remaining fields from being converted.

    Args:
        extract: Extracted financials exposing a ``periods`` iterable.
        multiplier: Factor converting reported figures to the base unit.
            A falsy value or ``1`` is a no-op.
    """
    if not multiplier or multiplier == 1:
        return
    factor = float(multiplier)
    for period in getattr(extract, "periods", None) or []:
        for field in _NUM_FIELDS:
            # Default to None so a period lacking one of the fields is
            # skipped instead of aborting the whole conversion.
            value = getattr(period, field, None)
            if value is None:
                continue
            try:
                setattr(period, field, float(value) * factor)
            except (TypeError, ValueError, OverflowError):
                # Non-numeric or rejected value: keep the original as-is.
                continue
 def analyze(
     files: List,
     if not files or len(files) == 0:
         raise gr.Error("決算書ファイル（PDF/画像）を1つ以上アップロードしてください。")
+    # 1) Upload files to OpenAI and extract structured financials via vision
     try:
         file_ids = []
         for f in files:
     except Exception as e:
         raise gr.Error(f"ファイルのアップロードに失敗しました: {e}")
+    # Local paths for text & unit fallback
     local_paths = []
     for f in files:
         if isinstance(f, (str, bytes)) or hasattr(f, "__fspath__"):
     if industry_hint:
         extract.industry = industry_hint
+    # --- 単位検出＆換算（円/ドル等の素単位に正規化）---
+    unit_info = {"source_label": "不明", "multiplier": 1}
+    try:
+        if local_paths:
+            mult, label = detect_unit_multiplier_from_paths(local_paths)
+            unit_info = {"source_label": label, "multiplier": int(mult)}
+            if mult and mult != 1:
+                scale_extract_inplace(extract, mult)
+    except Exception as e:
+        if debug:
+            print(f"[unit-detect] warning: {e}")
     # 2) Compute derived ratios and risk score
     ratios = compute_ratios(extract)
         decisions["investment"] = investment_decision(extract, ratios, POLICIES, multiples)
     # 4) Build a combined report (dict) and return displays
+    report = build_report_dict(extract, ratios, decisions, unit_info=unit_info)
     report_json = json.dumps(report, ensure_ascii=False, indent=2)
+    # Save a downloadable JSON (ensure directory exists; Gradio v5 は /tmp を推奨)
     ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
     data_dir = os.environ.get("HF_DATA_DIR", "/tmp")
     os.makedirs(data_dir, exist_ok=True)
     out_path = os.path.join(data_dir, f"report-{ts}.json")
     if extract.fiscal_year_end:
         summary_md.append(f"### 決算期末\n{extract.fiscal_year_end}")
+    summary_md.append("### 単位（検出結果）")
+    summary_md.append(f"- ソース表記: {unit_info['source_label']} / 乗数: x{unit_info['multiplier']:,}" + ("（数値は換算済み）" if unit_info["multiplier"] != 1 else ""))
     summary_md.append("### 指標（主要）")
     summary_md.append(
         f"- 売上高: {ratios.get('revenue')}\n"
 if __name__ == "__main__":
     demo = build_ui()
+    demo.launch(allowed_paths=["/tmp", "/mnt/data"])  # /tmp を既定保存先にしつつ、必要なら /mnt/data も許可

finance_core.py CHANGED Viewed

@@ -251,11 +251,24 @@ def investment_decision(extract: FinancialExtract, ratios: Dict[str, Any], polic
             "recommended_check_size": check, "recommended_check_size_display": _fmt_currency(check, currency),
             "attractiveness": attractiveness, "growth_label": glabel}
-def build_report_dict(extract: FinancialExtract, ratios: Dict[str, Any], decisions: Dict[str, Any]) -> Dict[str, Any]:
-    return {
-        "metadata": {"company_name": extract.company_name, "industry": extract.industry, "currency": extract.currency, "fiscal_year_end": extract.fiscal_year_end},
         "extracted": extract.dict(),
         "ratios": ratios,
         "decisions": decisions,
         "disclaimer": "本ツールはAIによる推定・一般的な計算式に基づく参考提案であり、投資勧誘・融資約定・与信保証を目的としたものではありません。最終判断は自己責任で、必要に応じて専門家の確認を行ってください。",
     }

             "recommended_check_size": check, "recommended_check_size_display": _fmt_currency(check, currency),
             "attractiveness": attractiveness, "growth_label": glabel}
def build_report_dict(
    extract: "FinancialExtract",
    ratios: Dict[str, Any],
    decisions: Dict[str, Any],
    unit_info: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the combined report as a plain, JSON-serializable dict.

    Args:
        extract: Extracted financials (Pydantic model).
        ratios: Derived ratios computed from ``extract``.
        decisions: Decision results keyed by decision type.
        unit_info: Optional unit-detection result; when non-empty it is
            included under the ``"unit_detection"`` key.

    Returns:
        A dict with ``metadata``, ``extracted``, ``ratios``, ``decisions``
        and ``disclaimer`` keys, plus ``unit_detection`` when provided.
    """
    # Prefer the Pydantic v2 serializer (`.dict()` is deprecated there) and
    # fall back to `.dict()` so v1-style models keep working.
    dump = getattr(extract, "model_dump", None)
    extracted = dump() if callable(dump) else extract.dict()

    out = {
        "metadata": {
            "company_name": extract.company_name,
            "industry": extract.industry,
            "currency": extract.currency,
            "fiscal_year_end": extract.fiscal_year_end,
        },
        "extracted": extracted,
        "ratios": ratios,
        "decisions": decisions,
        "disclaimer": "本ツールはAIによる推定・一般的な計算式に基づく参考提案であり、投資勧誘・融資約定・与信保証を目的としたものではありません。最終判断は自己責任で、必要に応じて専門家の確認を行ってください。",
    }
    if unit_info:
        # Copy so later mutation of the caller's dict does not leak into
        # the report.
        out["unit_detection"] = dict(unit_info)
    return out

llm_extract.py CHANGED Viewed

@@ -43,9 +43,11 @@ def _strip_code_fences(s: str) -> str:
     return s2.strip()
 def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
-    """pypdf でテキスト抽出（画像PDFは空になることあり）。"""
-    from pypdf import PdfReader
-    out = []
     for p in paths:
         try:
             r = PdfReader(p)
@@ -53,14 +55,14 @@ def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
                 t = page.extract_text() or ""
                 if t:
                     out.append(t)
-                if sum(len(x) for x in out) > max_chars:
-                    break
         except Exception:
-            # 1つ失敗しても続行
             continue
-    text = "\n\n".join(out)
-    # 長すぎる場合は切り詰め
-    return text[:max_chars]
 def _json_loads_strict(raw: str) -> dict:
     try:
@@ -75,7 +77,7 @@ def extract_financials_from_files(
     currency_hint: Optional[str],
     model: str = VISION_MODEL,
     debug: bool = False,
-    local_paths: Optional[List[str]] = None,  # ← 追加
 ) -> FinancialExtract:
     schema = FinancialExtract.model_json_schema()
@@ -93,7 +95,7 @@ def extract_financials_from_files(
     if currency_hint:
         base_user += f"\nCurrency hint: {currency_hint}"
-    # 1) まずは Vision + file_id で試す
     try:
         resp = client.responses.create(
             model=model,
@@ -106,7 +108,6 @@ def extract_financials_from_files(
                     ],
                 },
             ],
-            # 一部ゲートウェイでは response_format 未対応のため入れない
             max_output_tokens=2048,
         )
         raw = _safe_output_text(resp)
@@ -114,13 +115,12 @@ def extract_financials_from_files(
         return FinancialExtract.model_validate(data)
     except Exception as e_vision:
-        # 2) フォールバック：テキスト抽出 → TEXT_MODEL で構造化
         if not local_paths:
             raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
         text = _pdf_text_concat(local_paths)
         if not text:
-            # 画像ベースPDFの可能性 → Visionエラー内容を返す
             raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
         user2 = (

     return s2.strip()
 def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
+    try:
+        from pypdf import PdfReader
+    except Exception:
+        return ""
+    out, total = [], 0
     for p in paths:
         try:
             r = PdfReader(p)
                 t = page.extract_text() or ""
                 if t:
                     out.append(t)
+                    total += len(t)
+                    if total > max_chars:
+                        break
         except Exception:
             continue
+        if total > max_chars:
+            break
+    return "\n\n".join(out)[:max_chars]
 def _json_loads_strict(raw: str) -> dict:
     try:
     currency_hint: Optional[str],
     model: str = VISION_MODEL,
     debug: bool = False,
+    local_paths: Optional[List[str]] = None,  # ← フォールバック用
 ) -> FinancialExtract:
     schema = FinancialExtract.model_json_schema()
     if currency_hint:
         base_user += f"\nCurrency hint: {currency_hint}"
+    # 1) Vision + file_id で試す（response_format 未使用）
     try:
         resp = client.responses.create(
             model=model,
                     ],
                 },
             ],
             max_output_tokens=2048,
         )
         raw = _safe_output_text(resp)
         return FinancialExtract.model_validate(data)
     except Exception as e_vision:
+        # 2) テキスト抽出 → TEXT_MODEL で構造化
         if not local_paths:
             raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
         text = _pdf_text_concat(local_paths)
         if not text:
             raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
         user2 = (

requirements.txt CHANGED Viewed

@@ -5,4 +5,3 @@ pyyaml>=6.0.1
 numpy>=1.26.4
 pandas>=2.2.2
 pypdf>=4.2.0

 numpy>=1.26.4
 pandas>=2.2.2
 pypdf>=4.2.0