|
|
import pandera as pa |
|
|
from pandera import Column, DataFrameSchema, Check |
|
|
import pandas as pd |
|
|
|
|
|
# Column names that must be present in a financials frame; passed as the
# ``required`` list by validate_financials.
FIN_REQUIRED = [
    "year",
    "quarter",
    "revenue",
    "ebit",
    "net_income",
    "total_assets",
    "total_equity",
]

# Column names that must be present in an ESG frame; passed as the
# ``required`` list by validate_esg.
ESG_REQUIRED = ["year", "metric", "value", "unit", "scope", "notes"]
|
|
|
|
|
# Canonical column name -> accepted source spellings, tried in order.
# Each canonical name also lists itself so that a header differing only in
# case (e.g. "Revenue") is renamed to the canonical spelling.
ALIASES = {
    "revenue": ["revenue", "sales", "売上", "売上高"],
    "ebit": ["ebit", "operating_income", "営業利益"],
    "net_income": ["net_income", "純利益", "profit"],
    "total_equity": ["total_equity", "shareholders_equity", "自己資本"],
}


def _fold_label(label) -> str:
    """Return *label* as a trimmed, lower-cased string for loose matching."""
    # str() first: column labels are not guaranteed to be strings.
    return str(label).strip().lower()


def normalize_columns(df: pd.DataFrame, required: list) -> pd.DataFrame:
    """Rename recognised column spellings to canonical names, then check them.

    For every canonical name in ``ALIASES`` that is not already a column,
    the first alias found in ``df`` is renamed to the canonical name. An
    exact label match is preferred; otherwise labels are compared ignoring
    case and surrounding whitespace. Names in ``required`` that have no
    ``ALIASES`` entry get the same loose matching against their own name.

    Args:
        df: Frame whose column labels should be normalised. It is not
            modified in place.
        required: Column names that must exist after renaming.

    Returns:
        A renamed copy of ``df``, or ``df`` itself when nothing needed
        renaming.

    Raises:
        ValueError: If any name in ``required`` is still absent after
            renaming; the message lists the missing names.
    """
    # Folded label -> original label. On folded collisions the last column
    # wins, but exact matches are always tried first below.
    folded = {_fold_label(col): col for col in df.columns}
    # Source label -> canonical name; applied in a single rename at the end.
    renames = {}

    def _find_source(candidates):
        """Return the first not-yet-claimed column matching a candidate."""
        for name in candidates:
            if name in df.columns:
                source = name
            else:
                source = folded.get(_fold_label(name))
            if source is not None and source not in renames:
                return source
        return None

    for canonical, spellings in ALIASES.items():
        if canonical in df.columns:
            continue
        source = _find_source(spellings)
        if source is not None:
            renames[source] = canonical

    # Required names without an alias entry (e.g. "year") would otherwise
    # only match on the exact spelling.
    for column in required:
        if column in df.columns or column in renames.values():
            continue
        source = _find_source([column])
        if source is not None:
            renames[source] = column

    if renames:
        df = df.rename(columns=renames)

    missing = [column for column in required if column not in df.columns]
    if missing:
        raise ValueError(f"必須列不足: {missing}")
    return df
|
|
|
|
|
# Pandera schema for a financials frame with canonical column names.
# year must be >= 1900 and revenue must be >= 0; total_assets and
# total_equity may contain nulls, the other columns may not.
# NOTE(review): no coerce=True is set, so columns must already carry the
# declared dtypes (e.g. an integer-typed revenue column would be rejected)
# -- confirm that strict dtypes are intended.
fin_schema = DataFrameSchema({
    "year": Column(int, Check.ge(1900)),
    "quarter": Column(str),
    "revenue": Column(float, Check.ge(0)),
    "ebit": Column(float),
    "net_income": Column(float),
    "total_assets": Column(float, nullable=True),
    "total_equity": Column(float, nullable=True),
})
|
|
|
|
|
# Pandera schema for an ESG metrics frame.
# year must be >= 1900; unit, scope and notes may contain nulls, while
# year, metric and value may not.
# NOTE(review): no coerce=True is set, so columns must already carry the
# declared dtypes -- confirm that strict dtypes are intended.
esg_schema = DataFrameSchema({
    "year": Column(int, Check.ge(1900)),
    "metric": Column(str),
    "value": Column(float),
    "unit": Column(str, nullable=True),
    "scope": Column(str, nullable=True),
    "notes": Column(object, nullable=True),
})
|
|
|
|
|
def validate_financials(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise column names of a financials frame and validate it.

    Args:
        df: Raw financials frame; column names may use any spelling that
            ``normalize_columns`` recognises.

    Returns:
        The frame returned by ``fin_schema.validate``.

    Raises:
        ValueError: From ``normalize_columns`` when a column listed in
            ``FIN_REQUIRED`` is missing.
        pandera.errors.SchemaErrors: When schema validation fails; with
            ``lazy=True`` all failures are collected before raising.
    """
    df = normalize_columns(df, FIN_REQUIRED)
    return fin_schema.validate(df, lazy=True)
|
|
|
|
|
def validate_esg(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise column names of an ESG frame and validate it.

    Args:
        df: Raw ESG metrics frame.

    Returns:
        The frame returned by ``esg_schema.validate``.

    Raises:
        ValueError: From ``normalize_columns`` when a column listed in
            ``ESG_REQUIRED`` is missing.
        pandera.errors.SchemaErrors: When schema validation fails; with
            ``lazy=True`` all failures are collected before raising.
    """
    df = normalize_columns(df, ESG_REQUIRED)
    return esg_schema.validate(df, lazy=True)
|
|
|