Corin1998 committed on
Commit
c13dc84
·
verified ·
1 Parent(s): 65060db

Create extract.py

Browse files
Files changed (1) hide show
  1. core/extract.py +81 -163
core/extract.py CHANGED
@@ -1,178 +1,96 @@
1
  # core/extract.py
2
  from __future__ import annotations
3
- import os, io, base64, json, shutil
4
- from typing import List, Tuple, Dict, Any
5
-
6
- import pandas as pd
7
  import pdfplumber
8
  from pdf2image import convert_from_path
9
- from openai import OpenAI
10
 
11
- # ==== モデル指定(環境変数で変更可) ====
12
- OPENAI_MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
13
- OPENAI_MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")
14
 
15
- # ==== 共通ユーティリティ ====
16
- def _b64(b: bytes) -> str:
17
- return base64.b64encode(b).decode("utf-8")
 
 
 
18
 
19
- def _client() -> OpenAI:
20
- """
21
- OpenAI公式 SDK v1 系。httpx との互換のため requirements は httpx==0.27.* を推奨。
22
- """
23
- key = os.environ.get("OPENAI_API_KEY")
24
- if not key:
25
- raise RuntimeError("OPENAI_API_KEY が未設定です(Spaces Settings → Variables and secrets)。")
26
- # proxies は渡さない(互換性エラーを避ける)
27
- return OpenAI(api_key=key, timeout=30)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- # ==== PDF 読み込み ====
30
- def _pdf_to_images(path: str, dpi: int = 220, max_pages: int = 6) -> List[bytes]:
 
 
 
 
31
  """
32
- Poppler系バイナリ(pdftoppm/pdftocairo)が必要です。Spaces なら packages.txt に
33
- `poppler-utils` を入れておくと安定します。
 
 
 
34
  """
35
- imgs: List[bytes] = []
36
- pages = convert_from_path(path, dpi=dpi, fmt="png")
37
- for i, p in enumerate(pages):
38
- if i >= max_pages:
39
- break
40
- buf = io.BytesIO()
41
- p.save(buf, format="PNG")
42
- imgs.append(buf.getvalue())
43
- return imgs
44
-
45
- def _pdf_to_text(path: str, max_chars: int = 15000) -> str:
46
- out: List[str] = []
47
- with pdfplumber.open(path) as pdf:
48
- for i, page in enumerate(pdf.pages):
49
- t = (page.extract_text() or "").strip()
50
- if t:
51
- out.append(f"[page {i+1}]\n{t}")
52
- if sum(len(x) for x in out) > max_chars:
53
- break
54
- return "\n\n".join(out)[:max_chars]
55
-
56
- # ==== LLM へ渡す JSON 指示 ====
57
- _SYSTEM_JSON = """あなたは有能な財務アナリストです。
58
- 与えられた決算書(画像またはテキスト)から、次の厳密な JSON 構造のみを日本語の単位なし・半角数値で返してください。分からない項目は null。
59
- {
60
- "company": {"name": null},
61
- "period": {"start_date": null, "end_date": null},
62
- "balance_sheet": {
63
- "total_assets": null, "total_liabilities": null, "total_equity": null,
64
- "current_assets": null, "fixed_assets": null,
65
- "current_liabilities": null, "long_term_liabilities": null
66
- },
67
- "income_statement": {
68
- "sales": null, "cost_of_sales": null, "gross_profit": null,
69
- "operating_expenses": null, "operating_income": null,
70
- "ordinary_income": null, "net_income": null
71
- },
72
- "cash_flows": {
73
- "operating_cash_flow": null, "investing_cash_flow": null, "financing_cash_flow": null
74
- }
75
- }
76
- """
77
 
78
- def _extract_with_llm(images: List[bytes] | None, text_blob: str | None, company_hint: str) -> Dict[str, Any]:
79
- client = _client()
80
- if images:
81
- content = [{"type": "text", "text": _SYSTEM_JSON}]
82
- if company_hint:
83
- content.append({"type": "text", "text": f"会社名の候補: {company_hint}"})
84
- for im in images:
85
- content.append({"type": "input_image", "image_url": f"data:image/png;base64,{_b64(im)}"})
86
- resp = client.chat.completions.create(
87
- model=OPENAI_MODEL_VISION,
88
- messages=[
89
- {"role": "system", "content": "返答は必ず有効な JSON オブジェクトのみ。説明を含めない。"},
90
- {"role": "user", "content": content},
91
- ],
92
- response_format={"type": "json_object"},
93
- temperature=0.1,
94
- )
95
- return json.loads(resp.choices[0].message.content)
96
- else:
97
- prompt = f"{_SYSTEM_JSON}\n\n以下は決算書のテキストです。上記の JSON だけを返してください。\n\n{text_blob or ''}"
98
- resp = client.chat.completions.create(
99
- model=OPENAI_MODEL_TEXT,
100
- messages=[
101
- {"role": "system", "content": "返答は必ず有効な JSON オブジェクトのみ。"},
102
- {"role": "user", "content": prompt},
103
- ],
104
- response_format={"type": "json_object"},
105
- temperature=0.1,
106
- )
107
- return json.loads(resp.choices[0].message.content)
108
 
109
- # ==== JSON <-> DataFrame ====
110
- def _fin_to_df(fin: Dict[str, Any]) -> pd.DataFrame:
111
- rows: List[Dict[str, Any]] = []
112
- def add(cat: str, d: Dict[str, Any] | None):
113
- for k, v in (d or {}).items():
114
- rows.append({"category": cat, "item": k, "value": v})
115
- add("balance_sheet", fin.get("balance_sheet"))
116
- add("income_statement", fin.get("income_statement"))
117
- add("cash_flows", fin.get("cash_flows"))
118
- return pd.DataFrame(rows, columns=["category", "item", "value"])
119
 
120
- # ==== 公開 API:parse_pdf ====
121
- def parse_pdf(files: List[str], company: str = "", force_ocr: bool = False) -> Tuple[Dict[str, Any], pd.DataFrame]:
122
- """
123
- 入力: PDFファイルパスの配列
124
- 出力: (抽出JSON辞書, 表編集用DataFrame)
125
- 方針:
126
- - まず PDF→画像化して Vision で抽出(poppler が無い/失敗なら例外)
127
- - 画像抽出が失敗したらテキスト抽出→Textモデルで抽出
128
- - `force_ocr=True` の場合は常に画像→Vision を試みる
129
- """
130
- if not files:
131
- raise ValueError("PDF が指定されていません。")
132
-
133
- # 1) 画像化(複数PDFを順に)
134
  images: List[bytes] = []
135
- if force_ocr:
136
- for p in files:
137
- images += _pdf_to_images(p, dpi=220, max_pages=6)
138
- else:
139
- # 画像化を試して、ダメならテキストにフォールバック
140
  try:
141
- for p in files:
142
- images += _pdf_to_images(p, dpi=220, max_pages=6)
143
- except Exception:
144
- images = []
145
-
146
- # 2) Vision / Text のいずれかで抽出
147
- try:
148
- if images:
149
- fin = _extract_with_llm(images, None, company or "")
150
- else:
151
- # テキスト抽出
152
- text_blob = ""
153
- for p in files:
154
- text_blob += _pdf_to_text(p) + "\n\n"
155
- fin = _extract_with_llm(None, text_blob, company or "")
156
- except Exception as e:
157
- # LLM失敗時も最後にテキスト抽出で最低限の骨格を返す
158
- text_blob = ""
159
- for p in files:
160
- try:
161
- text_blob += _pdf_to_text(p) + "\n\n"
162
- except Exception:
163
- pass
164
- fin = {
165
- "company": {"name": company or None},
166
- "period": {"start_date": None, "end_date": None},
167
- "balance_sheet": {"total_assets": None, "total_liabilities": None, "total_equity": None,
168
- "current_assets": None, "fixed_assets": None,
169
- "current_liabilities": None, "long_term_liabilities": None},
170
- "income_statement": {"sales": None, "cost_of_sales": None, "gross_profit": None,
171
- "operating_expenses": None, "operating_income": None,
172
- "ordinary_income": None, "net_income": None},
173
- "cash_flows": {"operating_cash_flow": None, "investing_cash_flow": None, "financing_cash_flow": None},
174
- "_fallback_note": f"LLM抽出に失敗したため簡易骨格のみ返却(理由: {type(e).__name__})"
175
- }
176
-
177
- df = _fin_to_df(fin)
178
- return fin, df
 
1
  # core/extract.py
2
  from __future__ import annotations
3
+ import io, shutil
4
+ from typing import List, Tuple
 
 
5
  import pdfplumber
6
  from pdf2image import convert_from_path
 
7
 
8
class ExtractError(Exception):
    """Raised when PDF extraction cannot proceed.

    Used for unrecoverable, user-actionable failures: no input files were
    given, or PDF rasterisation failed because Poppler is not installed.
    """
 
10
 
11
def env_summary(binaries: Tuple[str, ...] = ("pdftoppm", "pdftocairo")) -> str:
    """Report which external binaries are discoverable on ``PATH``.

    Args:
        binaries: Executable names to look up. Defaults to the two Poppler
            tools checked by the original implementation.

    Returns:
        One entry per binary, joined by ``" / "``; each entry is a check
        mark (found) or a cross (missing) followed by the binary name.
        An empty ``binaries`` yields an empty string.
    """
    return " / ".join(
        ("✅" if shutil.which(name) is not None else "❌") + f" {name}"
        for name in binaries
    )
17
 
18
+ def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
19
+ """pdfminer ベースの純テキスト抽出(速い)"""
20
+ log = []
21
+ chunks = []
22
+ try:
23
+ with pdfplumber.open(path) as pdf:
24
+ for i, p in enumerate(pdf.pages):
25
+ t = (p.extract_text() or "").strip()
26
+ if t:
27
+ chunks.append(f"[page {i+1}]\n{t}")
28
+ if sum(len(c) for c in chunks) >= max_chars:
29
+ break
30
+ txt = "\n\n".join(chunks)[:max_chars]
31
+ log.append(f"pdfplumber text length={len(txt)}")
32
+ return txt, "\n".join(log)
33
+ except Exception as e:
34
+ log.append(f"pdfplumber error: {type(e).__name__}: {e}")
35
+ return "", "\n".join(log)
36
+
37
+ def _pick_business_text(raw_text: str) -> str:
38
+ """事業説明/会社概要っぽい段落を拾う(AI補足用)"""
39
+ keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
40
+ best = ""
41
+ for block in raw_text.split("\n\n"):
42
+ if any(k in block for k in keys):
43
+ best = block if len(block) > len(best) else best
44
+ return (best or raw_text[:1200])
45
 
46
def parse_pdf(
    file_paths: List[str],
    force_ocr: bool = False,
    dpi: int = 220,
    max_pages: int = 8
) -> Tuple[List[bytes], str, str, str]:
    """Extract text and, when needed, page images from one or more PDFs.

    Text is always extracted first. Pages are rasterised only when
    ``force_ocr`` is set or the combined text is shorter than 500
    characters.

    Args:
        file_paths: Paths of the PDF files to read, in order.
        force_ocr: Rasterise pages even if enough text was extracted.
        dpi: Resolution passed to the rasteriser.
        max_pages: Maximum number of page images across all files.

    Returns:
        A tuple ``(images, raw_text, business, debug_log)``:
            images: PNG-encoded page images (at most ``max_pages``).
            raw_text: Text extracted from all files, joined by blank lines.
            business: The part of ``raw_text`` closest to a business
                description.
            debug_log: Newline-joined extraction log.

    Raises:
        ExtractError: If ``file_paths`` is empty, or if rasterisation
            failed and ``pdftoppm`` is not on ``PATH``.
    """
    if not file_paths:
        raise ExtractError("PDFが指定されていません。")

    debug_lines = [f"[env] {env_summary()}"]

    # ---- Extract text from every file first.
    all_text = []
    for p in file_paths:
        txt, lg = _pdf_to_text(p)
        debug_lines.append(f"[text] {p}: {lg}")
        all_text.append(txt)
    raw_text = "\n\n".join(all_text)

    # ---- Rasterise only when the text is thin or OCR is forced.
    images: List[bytes] = []
    need_images = force_ocr or (len(raw_text) < 500)
    if need_images:
        try:
            for p in file_paths:
                remaining = max_pages - len(images)
                if remaining <= 0:
                    # Page budget exhausted: do not rasterise further files.
                    break
                # last_page bounds the conversion so that pages beyond the
                # budget are never rendered.
                pages = convert_from_path(
                    p, dpi=dpi, fmt="png", last_page=remaining
                )
                for pg in pages:
                    buf = io.BytesIO()
                    pg.save(buf, format="PNG")
                    images.append(buf.getvalue())
            debug_lines.append(f"[image] generated pages: {len(images)}")
        except Exception as e:
            # Missing Poppler is reported as an actionable error; any other
            # failure (e.g. a broken PDF) is only logged.
            debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
            if shutil.which("pdftoppm") is None:
                raise ExtractError(
                    "PDFの画像化に失敗しました(Poppler 未検出)。"
                    "Space packages.txt に `poppler-utils` を入れて再ビルドしてください。"
                ) from e
            # Otherwise give up on images and continue with text only.
    business = _pick_business_text(raw_text)
    return images, raw_text, business, "\n".join(debug_lines)