Corin1998 committed
Commit c37aeda · verified · 1 Parent(s): 876e0a2

Update core/extract.py
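
This update inlines the unit-detection helpers that previously lived in core.unit_utils, normalizes whatever `files` value Gradio hands over, verifies the httpx pin (httpx==0.27.2) when the OpenAI client is built, and moves unit scaling into _scale_fin(). The entry point still returns the same four-tuple; a minimal calling sketch (this Gradio wiring is illustrative, not part of the commit):

    # Hypothetical app.py handler; parse_pdf() normalizes `files` itself.
    from core.extract import parse_pdf

    def on_submit(files, company):
        fin, df, meta, log = parse_pdf(files, company=company, use_vision=True)
        return df, f"単位: {meta['unit_label']} (×{meta['unit_scale']:,})", log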

Files changed (1)
  core/extract.py  +166 -62
core/extract.py CHANGED
@@ -1,61 +1,78 @@
 # core/extract.py
 from __future__ import annotations
-import os, io, base64, json, shutil
+import os, io, re, json, base64, shutil
 from typing import List, Dict, Any, Tuple
 
-import pdfplumber
+import pandas as pd
 from pdf2image import convert_from_path
+import pdfplumber
+
+# Assumes the v1 OpenAI SDK (pin httpx==0.27.2 strictly on the requirements side)
 from openai import OpenAI
 
-from core.unit_utils import detect_unit_scale, apply_unit_scale
 
 OPENAI_MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
 OPENAI_MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")
 
-SYSTEM_JSON = """あなたは有能な財務アナリストです。
-与えられた決算書(画像またはテキスト)から、次の厳密な JSON 構造のみを日本語の単位なし・半角数値で返してください。分からない項目は null。
-{
-  "company": {"name": null},
-  "period": {"start_date": null, "end_date": null},
-  "balance_sheet": {
-    "total_assets": null, "total_liabilities": null, "total_equity": null,
-    "current_assets": null, "fixed_assets": null,
-    "current_liabilities": null, "long_term_liabilities": null
-  },
-  "income_statement": {
-    "sales": null, "cost_of_sales": null, "gross_profit": null,
-    "operating_expenses": null, "operating_income": null,
-    "ordinary_income": null, "net_income": null
-  },
-  "cash_flows": {
-    "operating_cash_flow": null, "investing_cash_flow": null, "financing_cash_flow": null
-  }
-}
-"""
 
+# ---------- Internal utilities ----------
 def _b64(img: bytes) -> str:
-    import base64
     return base64.b64encode(img).decode("utf-8")
 
 def _client() -> OpenAI:
+    # httpx compatibility check (the 0.28 series fails on the proxies argument)
+    try:
+        import httpx
+        if not httpx.__version__.startswith("0.27."):
+            raise RuntimeError(
+                f"httpx==0.27.x を利用してください(現在: {httpx.__version__})。"
+                " requirements.txt に `httpx==0.27.2` を明記。"
+            )
+    except Exception:
+        # re-raise so the message surfaces in the UI (easier to diagnose)
+        raise
+
     key = os.environ.get("OPENAI_API_KEY")
     if not key:
-        raise RuntimeError("OPENAI_API_KEY が未設定です。Spaces Settings → Variables and secrets に設定してください。")
-    # do not pass proxies (compatible with the httpx==0.27 series)
-    return OpenAI(api_key=key, timeout=30)
+        raise RuntimeError("OPENAI_API_KEY が未設定です。Spaces Secrets に追加してください。")
+    return OpenAI(api_key=key, timeout=60)
+
+def _coerce_filepaths(files) -> List[str]:
+    """Normalize the `files` value passed in by Gradio into a list of paths."""
+    paths: List[str] = []
+    if not files:
+        return []
+    if isinstance(files, str):
+        return [files] if files.lower().endswith(".pdf") and os.path.exists(files) else []
+    for f in files:
+        if isinstance(f, str):
+            p = f
+        elif isinstance(f, dict) and "name" in f:
+            p = f["name"]
+        elif hasattr(f, "name"):
+            p = getattr(f, "name")
+        elif isinstance(f, tuple) and f and isinstance(f[0], str):
+            p = f[0]
+        else:
+            p = None
+        if p and p.lower().endswith(".pdf") and os.path.exists(p):
+            paths.append(p)
+    return paths
 
+
+# ---------- PDF -> images / text ----------
 def pdf_to_images(pdf_path: str, dpi: int = 220, max_pages: int = 6) -> List[bytes]:
-    pages = convert_from_path(pdf_path, dpi=dpi, fmt="png")
+    images = convert_from_path(pdf_path, dpi=dpi, fmt="png")
     out: List[bytes] = []
-    for i, p in enumerate(pages):
+    for i, im in enumerate(images):
         if i >= max_pages:
             break
         buf = io.BytesIO()
-        p.save(buf, format="PNG")
+        im.save(buf, format="PNG")
         out.append(buf.getvalue())
     return out
 
-def pdf_to_text(pdf_path: str, max_chars: int = 16000) -> str:
+def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
     chunks: List[str] = []
     with pdfplumber.open(pdf_path) as pdf:
         for i, page in enumerate(pdf.pages):
@@ -66,13 +83,87 @@ def pdf_to_text(pdf_path: str, max_chars: int = 16000) -> str:
             break
     return "\n\n".join(chunks)[:max_chars]
 
-def _extract_with_vision(images: List[bytes], company_hint: str) -> Dict[str, Any]:
+
+# ---------- Unit detection ----------
+_UNIT_MAP = {
+    "円": 1,
+    "千円": 1_000,
+    "万円": 10_000,
+    "百万円": 1_000_000,
+    "million yen": 1_000_000,
+    "thousand yen": 1_000,
+    "yen": 1,
+}
+_UNIT_PATTERNS = [
+    r"単位\s*[::]?\s*(百万円|千円|万円|円)",
+    r"単位\s*[((]\s*(百万円|千円|万円|円)\s*[))]",
+    r"(?:units?)\s*[::]?\s*(million yen|thousand yen|yen)",
+]
+
+def detect_unit(text: str) -> Tuple[str, int, List[str]]:
+    """
+    Infer the reporting unit from the PDF text; the most frequent hit wins,
+    defaulting to 百万円 when nothing matches. Returns: (label, scale, hits[]).
+    """
+    hits: List[str] = []
+    for pat in _UNIT_PATTERNS:
+        for m in re.finditer(pat, text, flags=re.I):
+            g = m.group(1).lower()
+            # Japanese labels map directly; English ones stay lowercased for the map
+            if g in ["百万円", "千円", "万円", "円"]:
+                hits.append(g)
+            elif g in ["million yen", "thousand yen", "yen"]:
+                hits.append(g)
+
+    if hits:
+        # most frequent hit
+        from collections import Counter
+        label = Counter(hits).most_common(1)[0][0]
+        # prefer the Japanese label for display
+        disp = {"million yen": "百万円", "thousand yen": "千円", "yen": "円"}.get(label, label)
+        scale = _UNIT_MAP[label]
+        return disp, scale, hits
+
+    # secondary hints such as 「千円未満切捨て」 (amounts truncated to thousands)
+    if re.search(r"千円.*切[捨下]", text):
+        return "千円", 1_000, ["補助ヒント: 千円未満切捨て"]
+    if re.search(r"百万円.*切[捨下]", text):
+        return "百万円", 1_000_000, ["補助ヒント: 百万円切捨て"]
+
+    # default to 百万円 when nothing is found
+    return "百万円", 1_000_000, []
+
+
+# ---------- Table extraction via OpenAI ----------
+SYSTEM_JSON = """あなたは有能な財務アナリストです。
+与えられた決算書(画像またはテキスト)から、次の厳密な JSON 構造のみを日本語の単位なし・半角数値で返してください。分からない項目は null。
+{
+  "company": {"name": null},
+  "period": {"start_date": null, "end_date": null},
+  "balance_sheet": {
+    "total_assets": null, "total_liabilities": null, "total_equity": null,
+    "current_assets": null, "fixed_assets": null,
+    "current_liabilities": null, "long_term_liabilities": null
+  },
+  "income_statement": {
+    "sales": null, "cost_of_sales": null, "gross_profit": null,
+    "operating_expenses": null, "operating_income": null,
+    "ordinary_income": null, "net_income": null
+  },
+  "cash_flows": {
+    "operating_cash_flow": null, "investing_cash_flow": null, "financing_cash_flow": null
+  }
+}
+"""
+
+def _extract_with_vision(images: List[bytes], company_hint: str = "") -> Dict[str, Any]:
     client = _client()
     content = [{"type": "text", "text": SYSTEM_JSON}]
     if company_hint:
         content.append({"type": "text", "text": f"会社名の候補: {company_hint}"})
     for im in images:
         content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{_b64(im)}"}})
+
     resp = client.chat.completions.create(
         model=OPENAI_MODEL_VISION,
         messages=[
@@ -84,9 +175,9 @@ def _extract_with_vision(images: List[bytes], company_hint: str) -> Dict[str, Any]:
     )
     return json.loads(resp.choices[0].message.content)
 
-def _extract_with_text(text_blob: str, company_hint: str) -> Dict[str, Any]:
+def _extract_with_text(text: str, company_hint: str = "") -> Dict[str, Any]:
     client = _client()
-    prompt = f"{SYSTEM_JSON}\n\n以下は決算書のテキストです。上記の JSON だけを返してください。\n\n{text_blob or ''}"
+    prompt = f"{SYSTEM_JSON}\n\n以下は決算書のテキストです。上記の JSON だけを返してください。\n\n{text or ''}"
    resp = client.chat.completions.create(
         model=OPENAI_MODEL_TEXT,
         messages=[
@@ -98,8 +189,9 @@ def _extract_with_text(text_blob: str, company_hint: str) -> Dict[str, Any]:
     )
     return json.loads(resp.choices[0].message.content)
 
-def fin_to_df(fin: Dict[str, Any]):
-    import pandas as pd
+
+# ---------- JSON <-> DataFrame conversion and scaling ----------
+def fin_to_df(fin: Dict[str, Any]) -> pd.DataFrame:
     rows = []
     def add(cat, d):
         for k, v in (d or {}).items():
@@ -109,29 +201,48 @@
     add("cash_flows", fin.get("cash_flows"))
     return pd.DataFrame(rows, columns=["category", "item", "value"])
 
-def parse_pdf(files: List[str], company: str = "", use_vision: bool = True) -> Tuple[Dict[str,Any], "pd.DataFrame", Dict[str,Any], str]:
+def _scale_fin(fin: Dict[str, Any], scale: float) -> Dict[str, Any]:
+    def sc_val(v):
+        if v in (None, "", "null"):
+            return None
+        try:
+            return float(v) * scale
+        except Exception:
+            return None
+
+    out = json.loads(json.dumps(fin))  # deep copy via JSON round-trip
+    for sec in ("balance_sheet", "income_statement", "cash_flows"):
+        if sec in out and isinstance(out[sec], dict):
+            for k, v in out[sec].items():
+                out[sec][k] = sc_val(v)
+    return out
+
+
+# ---------- Entry point: PDF parsing ----------
+def parse_pdf(files, company: str = "", use_vision: bool = True) -> Tuple[Dict[str, Any], pd.DataFrame, Dict[str, Any], str]:
     """
     Returns: (fin_scaled, df_scaled, meta, log)
     meta: {"unit_label","unit_scale","unit_hits":[...],"warnings":[...]}
     """
     logs = []
-    if not files:
+    paths = _coerce_filepaths(files)
+    if not paths:
         raise RuntimeError("PDF をアップロードしてください。")
 
     # 1) Concatenate the text (used as evidence for unit detection)
     all_text = ""
-    for p in files:
+    for p in paths:
         t = pdf_to_text(p)
         all_text += ("\n\n" + t) if all_text else t
-    scale, label, hits = detect_unit_scale(all_text)
-    logs.append(f"[unit] 推定: {label} (×{scale:,.0f}) | hits={hits[:5]}{'...' if len(hits)>5 else ''}")
+    unit_label, unit_scale, unit_hits = detect_unit(all_text)
+    logs.append(f"[unit] 推定: {unit_label} (×{unit_scale:,}) / hits: {unit_hits[:5]}{'...' if len(unit_hits)>5 else ''}")
 
-    # 2) Try Vision first, falling back to text on failure
+    # 2) Render to images and try Vision; fall back to text if it fails
     fin_raw: Dict[str, Any]
     if use_vision:
         try:
             all_images: List[bytes] = []
-            for p in files:
+            for p in paths:
                 all_images += pdf_to_images(p, dpi=220, max_pages=6)
             fin_raw = _extract_with_vision(all_images, company)
             logs.append("[extract] Vision 解析に成功")
@@ -141,25 +252,18 @@ def parse_pdf(files: List[str], company: str = "", use_vision: bool = True) -> Tuple[Dict[str,Any], "pd.DataFrame", Dict[str,Any], str]:
     else:
         fin_raw = _extract_with_text(all_text, company)
 
-    # 3) Apply the unit conversion
-    fin_scaled = apply_unit_scale(fin_raw, scale)
-
-    # consistency check (total assets ≈ liabilities + equity)
-    warn = []
-    try:
-        ta = float(fin_scaled.get("balance_sheet",{}).get("total_assets") or 0)
-        tl = float(fin_scaled.get("balance_sheet",{}).get("total_liabilities") or 0)
-        te = float(fin_scaled.get("balance_sheet",{}).get("total_equity") or 0)
-        if ta and (abs(ta - (tl + te)) / max(ta,1)) > 0.05:
-            warn.append("B/S 整合性に差分 >5%(単位や抽出精度を確認)")
-    except Exception:
-        pass
+    # 3) Unit scaling
+    fin_scaled = _scale_fin(fin_raw, unit_scale)
+    df_scaled = fin_to_df(fin_scaled)
 
+    # 4) Meta information
     meta = {
-        "unit_label": label,
-        "unit_scale": scale,
-        "unit_hits": hits,
-        "warnings": warn,
+        "unit_label": unit_label,
+        "unit_scale": unit_scale,
+        "unit_hits": unit_hits,
+        "warnings": [],
     }
-    df = fin_to_df(fin_scaled)
-    return fin_scaled, df, meta, "\n".join(logs + [f"[warn] {w}" for w in warn])
+
+    # 5) Log
+    log = "\n".join(logs)
+    return fin_scaled, df_scaled, meta, log
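
A quick sanity check of the new unit handling, runnable once the module's dependencies are installed (the sample strings and figures are made up for illustration):

    # Exercise detect_unit() and _scale_fin() with toy inputs.
    from core.extract import detect_unit, _scale_fin

    label, scale, hits = detect_unit("(単位:千円)")
    assert (label, scale) == ("千円", 1_000)

    fin = {"balance_sheet": {"total_assets": "1234", "total_equity": None}}
    scaled = _scale_fin(fin, scale)
    assert scaled["balance_sheet"]["total_assets"] == 1_234_000.0
    assert scaled["balance_sheet"]["total_equity"] is None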