"""Validation helpers for financial / ESG tables and a demo file index.

The module offers two things:

* pandera schemas plus column-name normalization for uploaded financial and
  ESG data frames;
* a placeholder "index" that simply unpacks an uploaded zip archive and lists
  the extracted files (no vector search yet).
"""
import os
import shutil
import tempfile
import zipfile
from pathlib import Path

import pandas as pd
import pandera as pa
from pandera import Check, Column, DataFrameSchema

# Columns every financial table must contain after normalization.
FIN_REQUIRED = [
    "year",
    "quarter",
    "revenue",
    "ebit",
    "net_income",
    "total_assets",
    "total_equity",
]

# Columns every ESG table must contain after normalization.
ESG_REQUIRED = ["year", "metric", "value", "unit", "scope", "notes"]

# Canonical column name -> accepted spellings, tried in the listed order.
ALIASES = {
    "revenue": ["revenue", "sales", "売上", "売上高"],
    "ebit": ["ebit", "operating_income", "営業利益"],
    "net_income": ["net_income", "純利益", "profit"],
    "total_equity": ["total_equity", "shareholders_equity", "自己資本"],
}


def normalize_columns(df: pd.DataFrame, required: list) -> pd.DataFrame:
    """Rename alias / differently-cased columns to their canonical names.

    Every key of ``ALIASES`` and every name in ``required`` is looked up in
    ``df``: first by exact label, then case-insensitively.  Aliases are
    applied for all ``ALIASES`` keys, whether or not they are in ``required``.
    A canonical column that is already present is never overwritten.

    Args:
        df: Input frame; it is not modified in place.
        required: Canonical column names that must exist after renaming.

    Returns:
        ``df`` itself when nothing had to be renamed, otherwise a renamed copy.

    Raises:
        ValueError: If any name in ``required`` is still missing.
    """
    present = set(df.columns)
    # Lower-cased label -> actual label.  Non-string labels (e.g. integer
    # headers) cannot match a name and are skipped instead of crashing.
    by_lower = {c.lower(): c for c in df.columns if isinstance(c, str)}

    renames = {}
    # dict.fromkeys de-duplicates while keeping a stable order.
    for target in dict.fromkeys([*ALIASES, *required]):
        if target in present:
            continue
        for name in (target, *ALIASES.get(target, ())):
            source = name if name in present else by_lower.get(name.lower())
            # One source column may feed only one canonical name.
            if source is not None and source not in renames:
                renames[source] = target
                break

    if renames:
        df = df.rename(columns=renames)

    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"必須列不足: {missing}")
    return df


# Schema for financial statements; the balance-sheet columns may contain nulls.
fin_schema = DataFrameSchema(
    {
        "year": Column(int, Check.ge(1900)),
        "quarter": Column(str),
        "revenue": Column(float, Check.ge(0)),
        "ebit": Column(float),
        "net_income": Column(float),
        "total_assets": Column(float, nullable=True),
        "total_equity": Column(float, nullable=True),
    }
)

# Schema for ESG metrics; the descriptive columns may contain nulls.
esg_schema = DataFrameSchema(
    {
        "year": Column(int, Check.ge(1900)),
        "metric": Column(str),
        "value": Column(float),
        "unit": Column(str, nullable=True),
        "scope": Column(str, nullable=True),
        "notes": Column(object, nullable=True),
    }
)


def validate_financials(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names, then validate ``df`` against ``fin_schema``.

    Validation runs with ``lazy=True``, so pandera reports all failures
    together rather than stopping at the first one.

    Raises:
        ValueError: If a required financial column is missing.
    """
    df = normalize_columns(df, FIN_REQUIRED)
    return fin_schema.validate(df, lazy=True)


def validate_esg(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names, then validate ``df`` against ``esg_schema``.

    Validation runs with ``lazy=True``, so pandera reports all failures
    together rather than stopping at the first one.

    Raises:
        ValueError: If a required ESG column is missing.
    """
    df = normalize_columns(df, ESG_REQUIRED)
    return esg_schema.validate(df, lazy=True)


def build_or_update_index(zip_path, index_dir="index"):
    """Unpack an uploaded zip into ``index_dir`` and list the extracted files.

    Demo implementation: the "index" is just the extraction folder.  Per the
    original note, app.py calls this as
    ``build_or_update_index(rzip, index_dir=...)``.

    Args:
        zip_path: Archive to unpack (anything ``zipfile.ZipFile`` accepts).
        index_dir: Target folder; any existing content is replaced.

    Returns:
        Sorted list of the extracted file paths as strings.
    """
    index_dir = Path(index_dir)
    # Open the archive first: a missing or corrupt zip must fail before the
    # previous index is deleted.
    with zipfile.ZipFile(zip_path, "r") as zf:
        if index_dir.exists():
            shutil.rmtree(index_dir)
        index_dir.mkdir(parents=True, exist_ok=True)
        zf.extractall(index_dir)

    # Real vector search is not implemented yet; only the file list is
    # returned.  Sorted so the result does not depend on filesystem order.
    return sorted(str(p) for p in index_dir.glob("**/*") if p.is_file())


def answer_with_context(query: str, index_dir="index"):
    """Return a demo answer listing up to five file names from the index.

    Placeholder for the future flow noted in the original: vector search,
    pick the top documents, then ask an LLM to compose the answer.

    Args:
        query: The user's question; echoed back in the answer.
        index_dir: Folder produced by ``build_or_update_index``.

    Returns:
        A message string; a fixed hint when ``index_dir`` does not exist.
    """
    index_dir = Path(index_dir)
    if not index_dir.exists():
        return "インデックスが存在しません。まず Zip をアップロードしてください。"

    # Sorted so the five-name preview is deterministic.
    files = sorted(p.name for p in index_dir.glob("**/*") if p.is_file())
    head = ", ".join(files[:5]) if files else "(なし)"
    return f"[RAGデモ] 質問: {query}\n参照候補: {head}"