# judging/llm_extract.py
# (uploaded by Corin1998 — commit 21ff730, "Upload 7 files")
# -*- coding: utf-8 -*-
from __future__ import annotations
import os, json
from typing import List, Optional
from openai import OpenAI
from schemas import FinancialExtract, MultipleSuggestion, MarketOutlook
# Model for the vision-capable extraction pass over uploaded files (env-overridable).
VISION_MODEL = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
# Model for text-only fallback extraction and auxiliary suggestion calls (env-overridable).
TEXT_MODEL = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")
def get_client() -> OpenAI:
    """Build an OpenAI client from the OPENAI_API_KEY environment variable.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set (message instructs how to
            add it in HF Spaces settings).
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        return OpenAI(api_key=api_key, timeout=120)
    raise RuntimeError("OPENAI_API_KEY が未設定です。Spaces → Settings → Variables and secrets で追加してください。")
def upload_file_to_openai(client: OpenAI, filename: str, file_bytes: bytes) -> str:
    """Upload raw bytes to OpenAI and return the created file id.

    Tries ``purpose="vision"`` first; if the API rejects it, retries with
    ``purpose="assistants"``.

    Args:
        client: OpenAI client (only ``client.files.create`` is used).
        filename: original filename; falls back to "uploaded" when empty.
        file_bytes: raw file content.

    Returns:
        The id of the created file object.
    """
    from io import BytesIO

    bio = BytesIO(file_bytes)
    try:
        f = client.files.create(file=(filename or "uploaded", bio), purpose="vision")
    except Exception:
        # BUG FIX: the failed first attempt may have read the stream to EOF,
        # so without rewinding the fallback would upload an empty body.
        bio.seek(0)
        f = client.files.create(file=(filename or "uploaded", bio), purpose="assistants")
    return f.id
def _safe_output_text(resp) -> str:
try:
return resp.output_text
except Exception:
try:
return resp.output[0].content[0].text
except Exception:
return ""
def _strip_code_fences(s: str) -> str:
s2 = s.strip()
if s2.startswith("```"):
s2 = s2.strip("`")
if "\n" in s2: s2 = s2.split("\n", 1)[-1]
if s2.endswith("```"):
s2 = s2[:-3]
return s2.strip()
def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
try:
from pypdf import PdfReader
except Exception:
return ""
out, total = [], 0
for p in paths:
try:
r = PdfReader(p)
for page in r.pages:
t = page.extract_text() or ""
if t:
out.append(t)
total += len(t)
if total > max_chars: break
except Exception:
continue
if total > max_chars: break
return "\n\n".join(out)[:max_chars]
def _json_loads_strict(raw: str) -> dict:
import json
try:
return json.loads(raw)
except json.JSONDecodeError:
return json.loads(_strip_code_fences(raw))
def extract_financials_from_files(
    client: OpenAI,
    file_ids: List[str],
    company_hint: Optional[str],
    currency_hint: Optional[str],
    model: str = VISION_MODEL,
    debug: bool = False,
    local_paths: Optional[List[str]] = None,
) -> FinancialExtract:
    """Extract structured financial figures from uploaded files.

    Two-stage strategy: first a vision-model pass over the already-uploaded
    OpenAI file ids; if ANY exception occurs there, fall back to extracting
    text from the local PDF paths and asking a text model to emit JSON that
    validates against the FinancialExtract schema.

    Raises RuntimeError when the vision pass fails and no local paths / no
    extractable text is available, or when the text fallback returns an
    empty response. NOTE(review): the `debug` parameter is accepted but not
    read anywhere in this body.
    """
    # Schema is only embedded into the text-fallback prompt, not the vision prompt.
    schema = FinancialExtract.model_json_schema()
    system = (
        "You are a meticulous financial analyst. Prefer consolidated and annual figures. "
        "Return strictly valid JSON only."
    )
    base_user = (
        "Extract: company_name, currency, fiscal_year_end, and the latest two periods' numbers "
        "(revenue, COGS, EBIT, depreciation, EBITDA, net_income, cash, receivables, inventory, payables, "
        "current_assets, current_liabilities, total_assets, total_equity, total_debt, interest_expense)."
    )
    if company_hint:
        base_user += f"\nCompany hint: {company_hint}"
    if currency_hint:
        base_user += f"\nCurrency hint: {currency_hint}"
    try:
        # Primary path: prompt plus every uploaded file as an input_file part.
        resp = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": system}]},
                {"role": "user",
                 "content": [{"type": "input_text", "text": base_user}] + [{"type":"input_file","file_id": fid} for fid in file_ids]},
            ],
            max_output_tokens=2048,
        )
        raw = _safe_output_text(resp)
        data = _json_loads_strict(raw)
        return FinancialExtract.model_validate(data)
    except Exception as e_vision:
        # Fallback path: local PDF text + text model. Requires local_paths.
        if not local_paths:
            raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
        text = _pdf_text_concat(local_paths)
        if not text:
            # Likely an image-only (scanned) PDF with no extractable text layer.
            raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
        # Truncate again to 150k chars even though _pdf_text_concat caps at 180k,
        # leaving prompt budget for the schema appended below.
        user2 = (
            base_user
            + "\n\nSOURCE TEXT (truncated):\n" + text[:150000]
            + "\n\nReturn a single JSON object that VALIDATES this JSON Schema (no extra keys):\n"
            + json.dumps(schema, ensure_ascii=False)
        )
        resp2 = client.responses.create(
            # Env var is re-read here, so a runtime override wins over TEXT_MODEL.
            model=os.environ.get("OPENAI_TEXT_MODEL", TEXT_MODEL),
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": system}]},
                {"role": "user", "content": [{"type": "input_text", "text": user2}]},
            ],
            max_output_tokens=2048,
        )
        raw2 = _safe_output_text(resp2)
        if not raw2:
            raise RuntimeError("テキスト抽出フォールバックでも応答が空でした。")
        data2 = _json_loads_strict(raw2)
        return FinancialExtract.model_validate(data2)
def suggest_multiples_with_llm(client: OpenAI, text_model: str, industry: str, region: str = "JP", debug: bool = False) -> Optional[MultipleSuggestion]:
    """Ask the text model for conservative revenue/EBITDA multiples for an industry.

    Returns None when no industry is given, or when any step of the call /
    parse / validation fails (the failure is re-raised instead when *debug*).
    """
    if not industry:
        return None
    prompt_system = "You are an equity analyst. Provide conservative valuation multiple estimates."
    prompt_user = (
        f"Industry: {industry}\nRegion: {region}\n"
        "Return STRICT JSON: {\"revenue_multiple\": float, \"ebitda_multiple\": float}"
    )
    try:
        response = client.responses.create(
            model=text_model,
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": prompt_system}]},
                {"role": "user", "content": [{"type": "input_text", "text": prompt_user}]},
            ],
            max_output_tokens=200,
        )
        payload = _json_loads_strict(_safe_output_text(response))
        return MultipleSuggestion.model_validate(payload)
    except Exception:
        if debug:
            raise
        return None
def suggest_market_outlook_with_llm(
    client: OpenAI,
    text_model: str,
    industry: str,
    market_notes: str,
    region: str = "JP",
    debug: bool = False,
) -> Optional[MarketOutlook]:
    """Return a Japanese-language market-expansion outlook for the next 3-5 years.

    Returns None when neither an industry nor notes are supplied, when the
    model response is empty, or when any call/parse/validation step fails
    (re-raised instead when *debug* is True).
    """
    if not (industry or market_notes):
        return None
    sys_msg = "日本語で簡潔に出力してください。You are a market analyst. Rate market expansion expectations for the next 3-5 years."
    usr_msg = (
        f"業種: {industry or '不明'}\n地域: {region}\n補足メモ: {market_notes or 'なし'}\n"
        "以下のキーのみ、**日本語**で厳格なJSONを返してください。余計な説明は出力しない:\n"
        " expectation_label: とても低い / 低い / 中程度 / 高い / とても高い のいずれか\n"
        " expectation_score: 整数 1-5\n"
        " expected_market_cagr: 小数(%)\n"
        " rationale: 60文字以内の日本語要約\n"
    )
    try:
        response = client.responses.create(
            model=text_model,
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": sys_msg}]},
                {"role": "user", "content": [{"type": "input_text", "text": usr_msg}]},
            ],
            max_output_tokens=250,
        )
        raw = _safe_output_text(response)
        if not raw:
            return None
        payload = _json_loads_strict(raw)
        # Safety net: accept the English-suffixed key if the model emitted it.
        if "expectation_label_ja" in payload and "expectation_label" not in payload:
            payload["expectation_label"] = payload.pop("expectation_label_ja")
        return MarketOutlook.model_validate(payload)
    except Exception:
        if debug:
            raise
        return None