# judging/llm_extract.py
# (uploaded by Corin1998 — commit 21ff730, "Upload 7 files")
# -*- coding: utf-8 -*-
from __future__ import annotations
import os, json
from typing import List, Optional
from openai import OpenAI
from schemas import FinancialExtract, MultipleSuggestion, MarketOutlook
# Model for the vision-capable extraction pass over uploaded files (env-overridable).
VISION_MODEL = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
# Model for text-only fallback extraction and auxiliary suggestion calls (env-overridable).
TEXT_MODEL = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")
def get_client() -> OpenAI:
    """Build an OpenAI client from the OPENAI_API_KEY environment variable.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set (message instructs how to
            add it in HF Spaces settings).
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        return OpenAI(api_key=api_key, timeout=120)
    raise RuntimeError("OPENAI_API_KEY が未設定です。Spaces → Settings → Variables and secrets で追加してください。")
def upload_file_to_openai(client: OpenAI, filename: str, file_bytes: bytes) -> str:
    """Upload raw bytes to OpenAI and return the created file id.

    Tries ``purpose="vision"`` first; if the API rejects it, retries with
    ``purpose="assistants"``.

    Args:
        client: OpenAI client (only ``client.files.create`` is used).
        filename: original filename; falls back to "uploaded" when empty.
        file_bytes: raw file content.

    Returns:
        The id of the created file object.
    """
    from io import BytesIO

    bio = BytesIO(file_bytes)
    try:
        f = client.files.create(file=(filename or "uploaded", bio), purpose="vision")
    except Exception:
        # BUG FIX: the failed first attempt may have read the stream to EOF,
        # so without rewinding the fallback would upload an empty body.
        bio.seek(0)
        f = client.files.create(file=(filename or "uploaded", bio), purpose="assistants")
    return f.id
def _safe_output_text(resp) -> str:
try:
return resp.output_text
except Exception:
try:
return resp.output[0].content[0].text
except Exception:
return ""
def _strip_code_fences(s: str) -> str:
s2 = s.strip()
if s2.startswith("```"):
s2 = s2.strip("`")
if "\n" in s2: s2 = s2.split("\n", 1)[-1]
if s2.endswith("```"):
s2 = s2[:-3]
return s2.strip()
def _pdf_text_concat(paths: List[str], max_chars: int = 180_000) -> str:
try:
from pypdf import PdfReader
except Exception:
return ""
out, total = [], 0
for p in paths:
try:
r = PdfReader(p)
for page in r.pages:
t = page.extract_text() or ""
if t:
out.append(t)
total += len(t)
if total > max_chars: break
except Exception:
continue
if total > max_chars: break
return "\n\n".join(out)[:max_chars]
def _json_loads_strict(raw: str) -> dict:
import json
try:
return json.loads(raw)
except json.JSONDecodeError:
return json.loads(_strip_code_fences(raw))
def extract_financials_from_files(
    client: OpenAI,
    file_ids: List[str],
    company_hint: Optional[str],
    currency_hint: Optional[str],
    model: str = VISION_MODEL,
    debug: bool = False,
    local_paths: Optional[List[str]] = None,
) -> FinancialExtract:
    """Extract structured financial figures from uploaded files.

    Two-stage strategy: first a vision-model pass over the already-uploaded
    OpenAI file ids; if ANY exception occurs there, fall back to extracting
    text from the local PDF paths and asking a text model to emit JSON that
    validates against the FinancialExtract schema.

    Raises RuntimeError when the vision pass fails and no local paths / no
    extractable text is available, or when the text fallback returns an
    empty response. NOTE(review): the `debug` parameter is accepted but not
    read anywhere in this body.
    """
    # Schema is only embedded into the text-fallback prompt, not the vision prompt.
    schema = FinancialExtract.model_json_schema()
    system = (
        "You are a meticulous financial analyst. Prefer consolidated and annual figures. "
        "Return strictly valid JSON only."
    )
    base_user = (
        "Extract: company_name, currency, fiscal_year_end, and the latest two periods' numbers "
        "(revenue, COGS, EBIT, depreciation, EBITDA, net_income, cash, receivables, inventory, payables, "
        "current_assets, current_liabilities, total_assets, total_equity, total_debt, interest_expense)."
    )
    if company_hint:
        base_user += f"\nCompany hint: {company_hint}"
    if currency_hint:
        base_user += f"\nCurrency hint: {currency_hint}"
    try:
        # Primary path: prompt plus every uploaded file as an input_file part.
        resp = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": system}]},
                {"role": "user",
                 "content": [{"type": "input_text", "text": base_user}] + [{"type":"input_file","file_id": fid} for fid in file_ids]},
            ],
            max_output_tokens=2048,
        )
        raw = _safe_output_text(resp)
        data = _json_loads_strict(raw)
        return FinancialExtract.model_validate(data)
    except Exception as e_vision:
        # Fallback path: local PDF text + text model. Requires local_paths.
        if not local_paths:
            raise RuntimeError(f"Vision抽出に失敗し、かつローカルPDFテキストがありません: {e_vision}")
        text = _pdf_text_concat(local_paths)
        if not text:
            # Likely an image-only (scanned) PDF with no extractable text layer.
            raise RuntimeError(f"PDFが画像ベースの可能性があり、テキスト抽出できません。Vision側エラー: {e_vision}")
        # Truncate again to 150k chars even though _pdf_text_concat caps at 180k,
        # leaving prompt budget for the schema appended below.
        user2 = (
            base_user
            + "\n\nSOURCE TEXT (truncated):\n" + text[:150000]
            + "\n\nReturn a single JSON object that VALIDATES this JSON Schema (no extra keys):\n"
            + json.dumps(schema, ensure_ascii=False)
        )
        resp2 = client.responses.create(
            # Env var is re-read here, so a runtime override wins over TEXT_MODEL.
            model=os.environ.get("OPENAI_TEXT_MODEL", TEXT_MODEL),
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": system}]},
                {"role": "user", "content": [{"type": "input_text", "text": user2}]},
            ],
            max_output_tokens=2048,
        )
        raw2 = _safe_output_text(resp2)
        if not raw2:
            raise RuntimeError("テキスト抽出フォールバックでも応答が空でした。")
        data2 = _json_loads_strict(raw2)
        return FinancialExtract.model_validate(data2)
def suggest_multiples_with_llm(client: OpenAI, text_model: str, industry: str, region: str = "JP", debug: bool = False) -> Optional[MultipleSuggestion]:
    """Ask the text model for conservative revenue/EBITDA multiples for an industry.

    Returns None when no industry is given, or when any step of the call /
    parse / validation fails (the failure is re-raised instead when *debug*).
    """
    if not industry:
        return None
    prompt_system = "You are an equity analyst. Provide conservative valuation multiple estimates."
    prompt_user = (
        f"Industry: {industry}\nRegion: {region}\n"
        "Return STRICT JSON: {\"revenue_multiple\": float, \"ebitda_multiple\": float}"
    )
    try:
        response = client.responses.create(
            model=text_model,
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": prompt_system}]},
                {"role": "user", "content": [{"type": "input_text", "text": prompt_user}]},
            ],
            max_output_tokens=200,
        )
        payload = _json_loads_strict(_safe_output_text(response))
        return MultipleSuggestion.model_validate(payload)
    except Exception:
        if debug:
            raise
        return None
def suggest_market_outlook_with_llm(
    client: OpenAI,
    text_model: str,
    industry: str,
    market_notes: str,
    region: str = "JP",
    debug: bool = False,
) -> Optional[MarketOutlook]:
    """Return a Japanese-language market-expansion outlook for the next 3-5 years.

    Returns None when neither an industry nor notes are supplied, when the
    model response is empty, or when any call/parse/validation step fails
    (re-raised instead when *debug* is True).
    """
    if not (industry or market_notes):
        return None
    sys_msg = "日本語で簡潔に出力してください。You are a market analyst. Rate market expansion expectations for the next 3-5 years."
    usr_msg = (
        f"業種: {industry or '不明'}\n地域: {region}\n補足メモ: {market_notes or 'なし'}\n"
        "以下のキーのみ、**日本語**で厳格なJSONを返してください。余計な説明は出力しない:\n"
        " expectation_label: とても低い / 低い / 中程度 / 高い / とても高い のいずれか\n"
        " expectation_score: 整数 1-5\n"
        " expected_market_cagr: 小数(%)\n"
        " rationale: 60文字以内の日本語要約\n"
    )
    try:
        response = client.responses.create(
            model=text_model,
            input=[
                {"role": "system", "content": [{"type": "input_text", "text": sys_msg}]},
                {"role": "user", "content": [{"type": "input_text", "text": usr_msg}]},
            ],
            max_output_tokens=250,
        )
        raw = _safe_output_text(response)
        if not raw:
            return None
        payload = _json_loads_strict(raw)
        # Safety net: accept the English-suffixed key if the model emitted it.
        if "expectation_label_ja" in payload and "expectation_label" not in payload:
            payload["expectation_label"] = payload.pop("expectation_label_ja")
        return MarketOutlook.model_validate(payload)
    except Exception:
        if debug:
            raise
        return None