Spaces:

Corin1998
/

Score

Sleeping

App Files Files Community

Corin1998 committed on Aug 28, 2025

Commit

c13dc84

verified ·

1 Parent(s): 65060db

Create extract.py

Browse files

Files changed (1) hide show

core/extract.py +81 -163

core/extract.py CHANGED Viewed

@@ -1,178 +1,96 @@
 # core/extract.py
 from __future__ import annotations
-import os, io, base64, json, shutil
-from typing import List, Tuple, Dict, Any
-import pandas as pd
 import pdfplumber
 from pdf2image import convert_from_path
-from openai import OpenAI
-# ==== モデル指定（環境変数で変更可） ====
-OPENAI_MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
-OPENAI_MODEL_TEXT   = os.environ.get("OPENAI_TEXT_MODEL",   "gpt-4o-mini")
-# ==== 共通ユーティリティ ====
-def _b64(b: bytes) -> str:
-    return base64.b64encode(b).decode("utf-8")
-def _client() -> OpenAI:
-    """
-    OpenAI公式 SDK v1 系。httpx との互換のため requirements は httpx==0.27.* を推奨。
-    """
-    key = os.environ.get("OPENAI_API_KEY")
-    if not key:
-        raise RuntimeError("OPENAI_API_KEY が未設定です（Spaces → Settings → Variables and secrets）。")
-    # proxies は渡さない（互換性エラーを避ける）
-    return OpenAI(api_key=key, timeout=30)
-# ==== PDF 読み込み ====
-def _pdf_to_images(path: str, dpi: int = 220, max_pages: int = 6) -> List[bytes]:
     """
-    Poppler系バイナリ（pdftoppm/pdftocairo）が必要です。Spaces なら packages.txt に
-    `poppler-utils` を入れておくと安定します。
     """
-    imgs: List[bytes] = []
-    pages = convert_from_path(path, dpi=dpi, fmt="png")
-    for i, p in enumerate(pages):
-        if i >= max_pages:
-            break
-        buf = io.BytesIO()
-        p.save(buf, format="PNG")
-        imgs.append(buf.getvalue())
-    return imgs
-def _pdf_to_text(path: str, max_chars: int = 15000) -> str:
-    out: List[str] = []
-    with pdfplumber.open(path) as pdf:
-        for i, page in enumerate(pdf.pages):
-            t = (page.extract_text() or "").strip()
-            if t:
-                out.append(f"[page {i+1}]\n{t}")
-            if sum(len(x) for x in out) > max_chars:
-                break
-    return "\n\n".join(out)[:max_chars]
-# ==== LLM へ渡す JSON 指示 ====
-_SYSTEM_JSON = """あなたは有能な財務アナリストです。
-与えられた決算書（画像またはテキスト）から、次の厳密な JSON 構造のみを日本語の単位なし・半角数値で返してください。分からない項目は null。
-{
-  "company": {"name": null},
-  "period": {"start_date": null, "end_date": null},
-  "balance_sheet": {
-    "total_assets": null, "total_liabilities": null, "total_equity": null,
-    "current_assets": null, "fixed_assets": null,
-    "current_liabilities": null, "long_term_liabilities": null
-  },
-  "income_statement": {
-    "sales": null, "cost_of_sales": null, "gross_profit": null,
-    "operating_expenses": null, "operating_income": null,
-    "ordinary_income": null, "net_income": null
-  },
-  "cash_flows": {
-    "operating_cash_flow": null, "investing_cash_flow": null, "financing_cash_flow": null
-  }
-}
-"""
-def _extract_with_llm(images: List[bytes] | None, text_blob: str | None, company_hint: str) -> Dict[str, Any]:
-    client = _client()
-    if images:
-        content = [{"type": "text", "text": _SYSTEM_JSON}]
-        if company_hint:
-            content.append({"type": "text", "text": f"会社名の候補: {company_hint}"})
-        for im in images:
-            content.append({"type": "input_image", "image_url": f"data:image/png;base64,{_b64(im)}"})
-        resp = client.chat.completions.create(
-            model=OPENAI_MODEL_VISION,
-            messages=[
-                {"role": "system", "content": "返答は必ず有効な JSON オブジェクトのみ。説明を含めない。"},
-                {"role": "user", "content": content},
-            ],
-            response_format={"type": "json_object"},
-            temperature=0.1,
-        )
-        return json.loads(resp.choices[0].message.content)
-    else:
-        prompt = f"{_SYSTEM_JSON}\n\n以下は決算書のテキストです。上記の JSON だけを返してください。\n\n{text_blob or ''}"
-        resp = client.chat.completions.create(
-            model=OPENAI_MODEL_TEXT,
-            messages=[
-                {"role": "system", "content": "返答は必ず有効な JSON オブジェクトのみ。"},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-            temperature=0.1,
-        )
-        return json.loads(resp.choices[0].message.content)
-# ==== JSON <-> DataFrame ====
-def _fin_to_df(fin: Dict[str, Any]) -> pd.DataFrame:
-    rows: List[Dict[str, Any]] = []
-    def add(cat: str, d: Dict[str, Any] | None):
-        for k, v in (d or {}).items():
-            rows.append({"category": cat, "item": k, "value": v})
-    add("balance_sheet", fin.get("balance_sheet"))
-    add("income_statement", fin.get("income_statement"))
-    add("cash_flows", fin.get("cash_flows"))
-    return pd.DataFrame(rows, columns=["category", "item", "value"])
-# ==== 公開 API：parse_pdf ====
-def parse_pdf(files: List[str], company: str = "", force_ocr: bool = False) -> Tuple[Dict[str, Any], pd.DataFrame]:
-    """
-    入力: PDFファイルパスの配列
-    出力: (抽出JSON辞書, 表編集用DataFrame)
-    方針:
-      - まず PDF→画像化して Vision で抽出（poppler が無い/失敗なら例外）
-      - 画像抽出が失敗したらテキスト抽出→Textモデルで抽出
-      - `force_ocr=True` の場合は常に画像→Vision を試みる
-    """
-    if not files:
-        raise ValueError("PDF が指定されていません。")
-    # 1) 画像化（複数PDFを順に）
     images: List[bytes] = []
-    if force_ocr:
-        for p in files:
-            images += _pdf_to_images(p, dpi=220, max_pages=6)
-    else:
-        # 画像化を試して、ダメならテキストにフォールバック
         try:
-            for p in files:
-                images += _pdf_to_images(p, dpi=220, max_pages=6)
-        except Exception:
-            images = []
-    # 2) Vision / Text のいずれかで抽出
-    try:
-        if images:
-            fin = _extract_with_llm(images, None, company or "")
-        else:
-            # テキスト抽出
-            text_blob = ""
-            for p in files:
-                text_blob += _pdf_to_text(p) + "\n\n"
-            fin = _extract_with_llm(None, text_blob, company or "")
-    except Exception as e:
-        # LLM失敗時も最後にテキスト抽出で最低限の骨格を返す
-        text_blob = ""
-        for p in files:
-            try:
-                text_blob += _pdf_to_text(p) + "\n\n"
-            except Exception:
-                pass
-        fin = {
-            "company": {"name": company or None},
-            "period": {"start_date": None, "end_date": None},
-            "balance_sheet": {"total_assets": None, "total_liabilities": None, "total_equity": None,
-                              "current_assets": None, "fixed_assets": None,
-                              "current_liabilities": None, "long_term_liabilities": None},
-            "income_statement": {"sales": None, "cost_of_sales": None, "gross_profit": None,
-                                 "operating_expenses": None, "operating_income": None,
-                                 "ordinary_income": None, "net_income": None},
-            "cash_flows": {"operating_cash_flow": None, "investing_cash_flow": None, "financing_cash_flow": None},
-            "_fallback_note": f"LLM抽出に失敗したため簡易骨格のみ返却（理由: {type(e).__name__})"
-        }
-    df = _fin_to_df(fin)
-    return fin, df

 # core/extract.py
 from __future__ import annotations
+import io, shutil
+from typing import List, Tuple
 import pdfplumber
 from pdf2image import convert_from_path
class ExtractError(Exception):
    """Error raised by the PDF extraction routines in this module."""
def env_summary(binaries: Tuple[str, ...] = ("pdftoppm", "pdftocairo")) -> str:
    """Report which external binaries are available on ``PATH``.

    Args:
        binaries: Executable names to probe with ``shutil.which``. Defaults
            to the two Poppler tools this module checks for.

    Returns:
        One entry per binary, joined by `` / ``; each entry is a check mark
        (found) or a cross (not found) followed by the binary name. An empty
        ``binaries`` yields an empty string.
    """
    return " / ".join(
        ("✅" if shutil.which(name) is not None else "❌") + f" {name}"
        for name in binaries
    )
+def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
+    """pdfminer ベースの純テキスト抽出（速い）"""
+    log = []
+    chunks = []
+    try:
+        with pdfplumber.open(path) as pdf:
+            for i, p in enumerate(pdf.pages):
+                t = (p.extract_text() or "").strip()
+                if t:
+                    chunks.append(f"[page {i+1}]\n{t}")
+                if sum(len(c) for c in chunks) >= max_chars:
+                    break
+        txt = "\n\n".join(chunks)[:max_chars]
+        log.append(f"pdfplumber text length={len(txt)}")
+        return txt, "\n".join(log)
+    except Exception as e:
+        log.append(f"pdfplumber error: {type(e).__name__}: {e}")
+        return "", "\n".join(log)
+def _pick_business_text(raw_text: str) -> str:
+    """事業説明/会社概要っぽい段落を拾う（AI補足用）"""
+    keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
+    best = ""
+    for block in raw_text.split("\n\n"):
+        if any(k in block for k in keys):
+            best = block if len(block) > len(best) else best
+    return (best or raw_text[:1200])
def parse_pdf(
    file_paths: List[str],
    force_ocr: bool = False,
    dpi: int = 220,
    max_pages: int = 8
) -> Tuple[List[bytes], str, str, str]:
    """Extract text and, when needed, page images from the given PDFs.

    Text is extracted from every file first. Pages are rasterized to PNG
    only when ``force_ocr`` is set or the combined text is shorter than 500
    characters.

    Args:
        file_paths: Paths of the PDF files to process.
        force_ocr: Rasterize pages even when enough text was extracted.
        dpi: Resolution passed to ``convert_from_path``.
        max_pages: Maximum number of page images collected across all files.

    Returns:
        A tuple ``(images, raw_text, business, debug_log)``:
          images    : PNG bytes, at most ``max_pages`` entries; empty when
                      rasterization was not needed or failed.
          raw_text  : Per-file extracted text joined with blank lines.
          business  : Result of ``_pick_business_text(raw_text)``.
          debug_log : Newline-joined extraction log lines.

    Raises:
        ExtractError: If ``file_paths`` is empty, or if rasterization fails
            and ``pdftoppm`` is not found on ``PATH``.
    """
    if not file_paths:
        raise ExtractError("PDFが指定されていません。")

    debug_lines = [f"[env] {env_summary()}"]

    # ---- Text extraction first, for every file.
    all_text = []
    for p in file_paths:
        txt, lg = _pdf_to_text(p)
        debug_lines.append(f"[text] {p}: {lg}")
        all_text.append(txt)
    raw_text = "\n\n".join(all_text)

    # ---- Rasterize only when the text is thin or OCR is forced.
    images: List[bytes] = []
    need_images = force_ocr or (len(raw_text) < 500)
    if need_images:
        try:
            for p in file_paths:
                remaining = max_pages - len(images)
                if remaining <= 0:
                    # Page budget used up: do not rasterize further files.
                    break
                # last_page stops the converter from rendering pages that
                # would be thrown away anyway.
                pages = convert_from_path(
                    p, dpi=dpi, fmt="png", last_page=remaining
                )
                for pg in pages[:remaining]:
                    buf = io.BytesIO()
                    pg.save(buf, format="PNG")
                    images.append(buf.getvalue())
            debug_lines.append(f"[image] generated pages: {len(images)}")
        except Exception as e:
            # Report a missing Poppler install or a broken PDF.
            debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
            if shutil.which("pdftoppm") is None:
                raise ExtractError(
                    "PDFの画像化に失敗しました（Poppler 未検出）。"
                    "Space の packages.txt に `poppler-utils` を入れて再ビルドしてください。"
                ) from e
            # Otherwise give up on images and continue with text only.

    business = _pick_business_text(raw_text)
    return images, raw_text, business, "\n".join(debug_lines)